diff --git a/dnn/src/common/pooling.cpp b/dnn/src/common/pooling.cpp
index 597fc30c33cca0998d6a8c8111082aab30eded0a..32a11d4e99ea3f8cfcf4d287e029a56cad5dfe10 100644
--- a/dnn/src/common/pooling.cpp
+++ b/dnn/src/common/pooling.cpp
@@ -47,7 +47,8 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
     } else if (param().format == Param::Format::NCHW4 ||
                param().format == Param::Format::NCHW44 ||
                param().format == Param::Format::NCHW88 ||
-               param().format == Param::Format::NCHW32) {
+               param().format == Param::Format::NCHW32 ||
+               param().format == Param::Format::NCHW64) {
         megdnn_assert(src.ndim == 5_z, "%s", errmsg_c);
         spatial_pos = 2;
@@ -82,6 +83,9 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
     if (param().format == Param::Format::NCHW32) {
         c *= 32;
     }
+    if (param().format == Param::Format::NCHW64) {
+        c *= 64;
+    }
     size_t oh, ow;
     size_t fh = this->param().window_h;
     size_t fw = this->param().window_w;
@@ -109,6 +113,8 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
         dst = TensorLayout{{n, c / 8, oh, ow, 8}, src.dtype, src.format};
     } else if (param().format == Param::Format::NCHW32) {
         dst = TensorLayout{{n, c / 32, oh, ow, 32}, src.dtype, src.format};
+    } else if (param().format == Param::Format::NCHW64) {
+        dst = TensorLayout{{n, c / 64, oh, ow, 64}, src.dtype, src.format};
     } else if (param().format == Param::Format::CHWN4) {
         dst = TensorLayout{{c / 4, oh, ow, n, 4}, src.dtype, src.format};
     } else {
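Note on the hunks above: NCHW64 stores channels in blocks of 64, so a 4-d NCHW shape (N, C, H, W) becomes the 5-d shape (N, C/64, H, W, 64) and only the two spatial dimensions take part in the window arithmetic. A minimal standalone sketch of that shape deduction (hypothetical helper names, not part of the patch):

    #include <cstddef>

    struct Shape5 { size_t n, cb, h, w, pack; };  // (N, C/64, H, W, 64)

    // Standard pooling output-extent formula, applied per spatial dim;
    // the batch, channel-block and pack dims pass through unchanged.
    Shape5 deduce_nchw64(Shape5 src, size_t pad, size_t window, size_t stride) {
        auto out = [&](size_t i) { return (i + 2 * pad - window) / stride + 1; };
        return {src.n, src.cb, out(src.h), out(src.w), 64};
    }
    // e.g. (64, 8, 28, 28, 64), window 2, stride 2, pad 0 -> (64, 8, 14, 14, 64)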
diff --git a/dnn/src/cuda/pooling/opr_impl.cpp b/dnn/src/cuda/pooling/opr_impl.cpp
index 89fbcb918a761b8739fff77a4d07f06844d48621..e88dfc61382607a3993885c2661e18e9f4ce1c29 100644
--- a/dnn/src/cuda/pooling/opr_impl.cpp
+++ b/dnn/src/cuda/pooling/opr_impl.cpp
@@ -9,13 +9,50 @@
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 #include "src/cuda/pooling/opr_impl.h"
+#include "src/cuda/relayout_format/opr_impl.h"
 
-#include "./pooling2d_int8.cuh"
+#include "./pooling2d_qint.cuh"
 #include "src/cuda/utils.h"
 
 namespace megdnn {
 namespace cuda {
 
+namespace {
+inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout,
+                                   const TensorLayout& src_layout,
+                                   TensorLayout& dst_layout,
+                                   RelayoutFormat::Param::Mode mode,
+                                   const int oc = 0, const int group = 1) {
+    if (src_layout.ndim > 0) {
+        RelayoutFormat::Param trans_param;
+        trans_param.mode = mode;
+        trans_param.oc = oc;
+        trans_param.group = group;
+        relayout->param() = trans_param;
+        relayout->deduce_layout(src_layout, dst_layout);
+    } else {
+        dst_layout = src_layout;
+    }
+}
+
+void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
+                      TensorLayout& inner_src, TensorLayout& inner_dst,
+                      Handle* handle,
+                      PoolingForwardImpl::Param::Format format) {
+    bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW;
+    if (src.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+        dst.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        auto relayout_opr = handle->create_operator<RelayoutFormat>();
+        deduce_reformat_layout(relayout_opr, src, inner_src,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
+        deduce_reformat_layout(relayout_opr, dst, inner_dst,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
+    } else {
+        megdnn_assert(0, "not supported");
+    }
+}
+
+}  // namespace
 
 void PoolingForwardImpl::setup_descs(const TensorLayout& src,
                                      const TensorLayout& dst) {
     src_desc.set(src, param().format);
@@ -28,14 +65,22 @@ WorkspaceBundle PoolingForwardImpl::get_workspace_bundle(
     SmallVector<size_t> sizes;
     TensorLayout fsrc = src;
     TensorLayout fdst = dst;
-    auto get_workspace = [&sizes](TensorLayout& layout) {
-        if (layout.dtype == dtype::BFloat16()) {
-            layout.dtype = dtype::Float32();
-            sizes.push_back(layout.span().dist_byte());
-        }
-    };
-    get_workspace(fsrc);
-    get_workspace(fdst);
+    bool is_nchw = param().format == Param::Format::NCHW;
+    if (src.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+        dst.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        get_inner_layout(src, dst, fsrc, fdst, handle(), param().format);
+        sizes.push_back(fsrc.span().dist_byte());
+        sizes.push_back(fdst.span().dist_byte());
+    } else {
+        auto get_workspace = [&sizes](TensorLayout& layout) {
+            if (layout.dtype == dtype::BFloat16()) {
+                layout.dtype = dtype::Float32();
+                sizes.push_back(layout.span().dist_byte());
+            }
+        };
+        get_workspace(fsrc);
+        get_workspace(fdst);
+    }
     return {ptr, std::move(sizes)};
 }
 
@@ -44,12 +89,27 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
     check_exec(ssrc.layout, sdst.layout, sworkspace.size);
     TensorND src = ssrc;
     TensorND dst = sdst;
+    Param::Format inner_format = param().format;
     auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout);
     auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
             concrete_handle(this->handle()), &wsb);
+    bool is_nchw = param().format == Param::Format::NCHW;
     if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
         ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst);
+    } else if (ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+               sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        auto handle_ptr = handle();
+        get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout,
+                         handle_ptr, param().format);
+        src.raw_ptr = wsb.get(0);
+        dst.raw_ptr = wsb.get(1);
+        auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>();
+        RelayoutFormat::Param trans_param;
+        trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64;
+        relayout_opr->param() = trans_param;
+        relayout_opr->exec(ssrc, src, {});
+        inner_format = Param::Format::NCHW64;
     }
     {
         using Format = param::Pooling::Format;
@@ -104,6 +164,34 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
             return pooling2d::do_pooling2d_int8_ncdiv32hw32(
                     src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
                     kern_param, stream, static_cast<uint32_t>(param().mode));
+        } else if (param().format == Format::NCHW64 ||
+                   inner_format == Format::NCHW64) {
+            megdnn_assert(src.layout.dtype.enumv() == DTypeEnum::QuantizedS4,
+                          "but %s", src.layout.dtype.name());
+            pooling2d::Param kern_param;
+            size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
+                   c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
+            c = c * 64;
+            size_t ph = param().pad_h, pw = param().pad_w;
+            size_t window_h = param().window_h, window_w = param().window_w;
+            size_t sh = param().stride_h, sw = param().stride_w;
+            kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
+            kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
+            kern_param.ph = ph, kern_param.pw = pw,
+            kern_param.window_h = window_h, kern_param.window_w = window_w,
+            kern_param.sh = sh, kern_param.sw = sw;
+            auto&& stream = cuda_stream(handle());
+            pooling2d::do_pooling2d_int4_ncdiv64hw64(
+                    (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
+                    stream, static_cast<uint32_t>(param().mode));
+            if (sdst.layout.ndim == 4) {
+                auto relayout_opr = handle()->create_operator<RelayoutFormat>();
+                RelayoutFormat::Param trans_param;
+                trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW;
+                relayout_opr->param() = trans_param;
+                relayout_opr->exec(dst, sdst, {});
+            }
+            return;
         }
         auto handle = cudnn_handle(this->handle());
         setup_descs(src.layout, dst.layout);
@@ -114,7 +202,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
     }
     if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
         ctypecvt.comp_to_dst_type(dst, sdst);
-    } 
+    }
 }
 
 void PoolingBackwardImpl::setup_descs(const TensorLayout& src,
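The CUDA forward path above never runs the int4 kernel directly on NCHW data. For QuantizedS4 input in NCHW it round-trips through the workspace, roughly:

    // ssrc (NCHW, QuantizedS4) --NCHW_NCHW64--> src  (workspace bundle slot 0)
    // do_pooling2d_int4_ncdiv64hw64(src, dst):  dst  (NCHW64, workspace slot 1)
    // dst  (NCHW64)            --NCHW64_NCHW--> sdst (original NCHW output)

which is why get_workspace_bundle() reserves space for the two relayouted tensors for exactly this dtype/format combination.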
diff --git a/dnn/src/cuda/pooling/pooling2d_int8.cu b/dnn/src/cuda/pooling/pooling2d_int8.cu
deleted file mode 100644
index d8b12dadac940e3a3397cd15975b0eba28c10345..0000000000000000000000000000000000000000
--- a/dnn/src/cuda/pooling/pooling2d_int8.cu
+++ /dev/null
@@ -1,592 +0,0 @@
-/**
- * \file dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-#include "./pooling2d_int8.cuh"
-#include "src/common/opr_param_defs_enumv.cuh"
-#include "src/cuda/query_blocksize.cuh"
-
-using namespace megdnn;
-using namespace cuda;
-using namespace pooling2d;
-
-namespace {
-// common macros
-#define FEED1 Base::feed(x, 0);
-#define FEED2           \
-    Base::feed(x.x, 0); \
-    Base::feed(x.y, 4);
-#define FEED4           \
-    FEED2;              \
-    Base::feed(x.z, 8); \
-    Base::feed(x.w, 12);
-
-#define ANS1(cb) cb(Base::res[0], Base::res[1], Base::res[2], Base::res[3], i1);
-
-#define ANS2(cb) \
-    ANS1(cb);    \
-    cb(Base::res[4], Base::res[5], Base::res[6], Base::res[7], i2);
-
-#define ANS4(cb)                                                      \
-    ANS2(cb);                                                         \
-    cb(Base::res[8], Base::res[9], Base::res[10], Base::res[11], i3); \
-    cb(Base::res[12], Base::res[13], Base::res[14], Base::res[15], i4);
-
-__device__ __forceinline__ int pack_int8_to_int8x4(int8_t x, int8_t y, int8_t z,
-                                                   int8_t w) {
-    int ix = static_cast<int>(x), iy = static_cast<int>(y),
-        iz = static_cast<int>(z), iw = static_cast<int>(w);
-
-    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(ix) : "r"(iy));
-    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(iz) : "r"(iw));
-    asm volatile("prmt.b32 %0, %0, %1, 0x5410;" : "+r"(ix) : "r"(iz));
-    return ix;
-}
-
-template <typename src_type, typename feed_type>
-struct MaxPoolerBase;
-
-template <typename feed_type>
-struct MaxPoolerBase<int8_t, feed_type> {
-    static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int8_t res[nr_results];
-
-    __device__ MaxPoolerBase(int) {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = -128;
-        }
-    }
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] = res[idx] > ix ? res[idx] : ix;
-        res[idx + 1] = res[idx + 1] > iy ? res[idx + 1] : iy;
-        res[idx + 2] = res[idx + 2] > iz ? res[idx + 2] : iz;
-        res[idx + 3] = res[idx + 3] > iw ? res[idx + 3] : iw;
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MaxPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type) \
-    template <>                         \
-    struct MaxPooler<int8_t, _feed_type> : MaxPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                    \
-    using feed_type = _feed_type;                  \
-    using Base = MaxPoolerBase<int8_t, feed_type>; \
-    using MaxPoolerBase<int8_t, feed_type>::MaxPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret) \
-    { _ret = pack_int8_to_int8x4(_x, _y, _z, _w); }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) { FEED1; }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) { FEED2; }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) { FEED4; }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename src_type, typename feed_type>
-struct MeanIncludeRoundedPoolerBase;
-
-template <typename feed_type>
-struct MeanIncludeRoundedPoolerBase<int8_t, feed_type> {
-    static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int32_t res[nr_results];
-    const int count;
-    const float fi_count;
-
-    __device__ MeanIncludeRoundedPoolerBase(int count)
-            : count{count}, fi_count{1.f / count} {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = 0;
-        }
-    }
-
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] += static_cast<int32_t>(ix);
-        res[idx + 1] += static_cast<int32_t>(iy);
-        res[idx + 2] += static_cast<int32_t>(iz);
-        res[idx + 3] += static_cast<int32_t>(iw);
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MeanIncludeRoundedPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type)                 \
-    template <>                                         \
-    struct MeanIncludeRoundedPooler<int8_t, _feed_type> \
-            : MeanIncludeRoundedPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                                   \
-    using feed_type = _feed_type;                                 \
-    using Base = MeanIncludeRoundedPoolerBase<int8_t, feed_type>; \
-    using MeanIncludeRoundedPoolerBase<int8_t, feed_type>::MeanIncludeRoundedPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret)                                          \
-    {                                                                     \
-        float fx = roundf(static_cast<float>(_x) * Base::fi_count);       \
-        float fy = roundf(static_cast<float>(_y) * Base::fi_count);       \
-        float fz = roundf(static_cast<float>(_z) * Base::fi_count);       \
-        float fw = roundf(static_cast<float>(_w) * Base::fi_count);       \
-        _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \
-    }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) { FEED1; }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) { FEED2; }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) { FEED4; }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename src_type, typename feed_type>
-struct MeanExcludeRoundedPoolerBase;
-
-template <typename feed_type>
-struct MeanExcludeRoundedPoolerBase<int8_t, feed_type> {
-    static const int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int32_t res[nr_results];
-    int count;
-
-    __device__ MeanExcludeRoundedPoolerBase(int /* count */) {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = 0;
-        }
-        count = 0;
-    }
-
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] += static_cast<int32_t>(ix);
-        res[idx + 1] += static_cast<int32_t>(iy);
-        res[idx + 2] += static_cast<int32_t>(iz);
-        res[idx + 3] += static_cast<int32_t>(iw);
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MeanExcludeRoundedPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type)                 \
-    template <>                                         \
-    struct MeanExcludeRoundedPooler<int8_t, _feed_type> \
-            : MeanExcludeRoundedPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                                   \
-    using feed_type = _feed_type;                                 \
-    using Base = MeanExcludeRoundedPoolerBase<int8_t, feed_type>; \
-    using MeanExcludeRoundedPoolerBase<int8_t, feed_type>::MeanExcludeRoundedPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret)                                          \
-    {                                                                     \
-        float fx = roundf(static_cast<float>(_x) / Base::count);          \
-        float fy = roundf(static_cast<float>(_y) / Base::count);          \
-        float fz = roundf(static_cast<float>(_z) / Base::count);          \
-        float fw = roundf(static_cast<float>(_w) / Base::count);          \
-        _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \
-    }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) {
-        FEED1;
-        count++;
-    }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) {
-        FEED2;
-        count++;
-    }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) {
-        FEED4;
-        count++;
-    }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_cdiv4hwn4(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tidx = threadIdx.x;
-    const int tidy = threadIdx.y;
-    const int bidx = blockIdx.x;
-    const int bidy = blockIdx.y;
-    const int bidz = blockIdx.z;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 4;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    const int batch = (bidy * blockDim.x + tidx) * ldg_width;
-    const int packed_ch = bidz * blockDim.y + tidy;
-    const int npack = param.n * pack_size;
-    if (batch >= param.n || packed_ch >= param.c / pack_size)
-        return;
-
-    const int ho = bidx / param.wo;
-    const int wo = bidx - param.wo * ho;
-    const int input_pixels = param.hi * param.wi;
-    const int output_pixels = param.ho * param.wo;
-    const int8_t* __restrict__ g_src_ptr =
-            src + batch * pack_size + packed_ch * input_pixels * npack;
-    int8_t* __restrict__ g_dst_ptr = dst + batch * pack_size +
-                                     packed_ch * output_pixels * npack +
-                                     (ho * param.wo + wo) * npack;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = ho * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = wo * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * npack;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_ncdiv4hw4(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 4;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    MEGDNN_STATIC_ASSERT(
-            ldg_width == 1,
-            "pooling2d (NCHW4) kernel must use 32bit width ldg instruction");
-    const int wo_ldg = param.wo / ldg_width;
-    const int c_packed = param.c / pack_size;
-    const int batch = tid / (param.ho * wo_ldg * c_packed);
-    const int chw = tid - batch * param.ho * wo_ldg * c_packed;
-    const int oc_packed = chw / (param.ho * wo_ldg);
-    const int hw = chw - oc_packed * param.ho * wo_ldg;
-    const int oh = hw / wo_ldg;
-    const int ow = (hw - wo_ldg * oh) * ldg_width;
-    if (batch >= param.n || oc_packed >= c_packed || oh >= param.ho ||
-        ow >= param.wo)
-        return;
-
-    const int in_batch_stride = param.hi * param.wi * param.c;
-    const int out_batch_stride = param.ho * param.wo * param.c;
-    const int in_channel_stride = param.hi * param.wi * pack_size;
-    const int out_channel_stride = param.ho * param.wo * pack_size;
-    const int8_t* __restrict__ g_src_ptr =
-            src + batch * in_batch_stride + oc_packed * in_channel_stride;
-    int8_t* __restrict__ g_dst_ptr = dst + batch * out_batch_stride +
-                                     oc_packed * out_channel_stride +
-                                     (oh * param.wo + ow) * pack_size;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = oh * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = ow * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * pack_size;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_ncdiv32hw32(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 32;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    static int constexpr ldg_width_bytes = sizeof(ldg_type);
-    static int constexpr section = pack_size / sizeof(ldg_type);
-    MEGDNN_STATIC_ASSERT(
-            ldg_width == 4,
-            "pooling2d (NCHW32) kernel must use 128bit width ldg instruction");
-    const int c_packed = param.c / pack_size;
-    const int batch = tid / (param.ho * param.wo * c_packed * section);
-    const int batch_residual =
-            tid - batch * param.ho * param.wo * c_packed * section;
-    const int oc = batch_residual / (param.ho * param.wo * section);
-    const int oc_residual = batch_residual - oc * param.ho * param.wo * section;
-    const int oh = oc_residual / (param.wo * section);
-    const int oh_residual = (oc_residual - oh * param.wo * section);
-    const int ow = oh_residual / section;
-    const int sec = oh_residual - ow * section;
-    if (batch >= param.n || oc >= c_packed || oh >= param.ho || ow >= param.wo)
-        return;
-
-    const int in_batch_stride = param.hi * param.wi * param.c;
-    const int out_batch_stride = param.ho * param.wo * param.c;
-    const int in_channel_stride = param.hi * param.wi * pack_size;
-    const int out_channel_stride = param.ho * param.wo * pack_size;
-    const int8_t* __restrict__ g_src_ptr = src + batch * in_batch_stride +
-                                           oc * in_channel_stride +
-                                           sec * ldg_width_bytes;
-    int8_t* __restrict__ g_dst_ptr =
-            dst + batch * out_batch_stride + oc * out_channel_stride +
-            (oh * param.wo + ow) * pack_size + sec * ldg_width_bytes;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = oh * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = ow * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * pack_size;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-};  // namespace
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src,
-                                                          int8_t* d_dst,
-                                                          const Param& param,
-                                                          cudaStream_t stream,
-                                                          uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads_x = 0, vthreads_y = param.c / 4;
-#define dispatch_pooling_mode(_feed_type)                          \
-    switch (mode) {                                                \
-        case Mode::MAX:                                            \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MaxPooler<int8_t, _feed_type>>;                \
-            break;                                                 \
-        case Mode::AVERAGE:                                        \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MeanIncludeRoundedPooler<int8_t, _feed_type>>; \
-            break;                                                 \
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:                  \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MeanExcludeRoundedPooler<int8_t, _feed_type>>; \
-            break;                                                 \
-        default:                                                   \
-            megdnn_assert(false, "invalid pooling mode");          \
-    }
-    if (param.n % 4 == 0) {
-        dispatch_pooling_mode(int4);
-        vthreads_x = param.n / 4;
-    } else if (param.n % 2 == 0) {
-        dispatch_pooling_mode(int2);
-        vthreads_x = param.n / 2;
-    } else {
-        dispatch_pooling_mode(int32_t);
-        vthreads_x = param.n;
-    }
-#undef dispatch_pooling_mode
-    constexpr uint32_t threads_x = 16;
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    uint32_t nr_threads_x = std::min(threads_x, vthreads_x),
-             nr_threads_y = std::min(nr_threads / nr_threads_x, vthreads_y);
-    uint32_t nr_blocks_x = param.ho * param.wo,
-             nr_blocks_y = DIVUP(vthreads_x, nr_threads_x),
-             nr_blocks_z = DIVUP(vthreads_y, nr_threads_y);
-    dim3 threads{nr_threads_x, nr_threads_y, 1};
-    dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z};
-    kern<<<blocks, threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src,
-                                                          int8_t* d_dst,
-                                                          const Param& param,
-                                                          cudaStream_t stream,
-                                                          uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads = param.n * param.c * param.ho * param.wo / 4;
-    switch (mode) {
-        case Mode::MAX:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MaxPooler<int8_t, int32_t>>;
-            break;
-        case Mode::AVERAGE:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MeanIncludeRoundedPooler<int8_t, int32_t>>;
-            break;
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MeanExcludeRoundedPooler<int8_t, int32_t>>;
-            break;
-        default:
-            megdnn_assert(false, "invalid pooling mode");
-    }
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    nr_threads = std::min(nr_threads, vthreads);
-    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
-    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src,
-                                                            int8_t* d_dst,
-                                                            const Param& param,
-                                                            cudaStream_t stream,
-                                                            uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads = param.n * param.c * param.ho * param.wo / 16;
-    switch (mode) {
-        case Mode::MAX:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MaxPooler<int8_t, int4>>;
-            break;
-        case Mode::AVERAGE:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MeanIncludeRoundedPooler<int8_t, int4>>;
-            break;
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MeanExcludeRoundedPooler<int8_t, int4>>;
-            break;
-        default:
-            megdnn_assert(false, "invalid pooling mode");
-    }
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    nr_threads = std::min(nr_threads, vthreads);
-    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
-    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-#undef FEED1
-#undef FEED2
-#undef FEED3
-#undef ANS1
-#undef ANS2
-#undef ANS4
-
-// vim: syntax=cuda.doxygen
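The deleted file reappears below as pooling2d_qint.cu: the macro-generated, int8-only pooler hierarchy is folded into single pooler templates parameterized over the source type via a TypeTrait, so the same kernels serve both int8 and int4. The byte packing the new code relies on puts lane 0 in the low nibble and lane 1 in the high nibble; a host-side sketch of the same convention as pack_int8_to_int4x2 (hypothetical standalone names):

    #include <cstdint>
    #include <cstdio>

    // Two signed int4 lanes per byte: x0 -> low nibble, x1 -> high nibble.
    static uint8_t pack_int4x2(int8_t x0, int8_t x1) {
        return uint8_t((x0 & 0xf) | (x1 << 4));
    }

    int main() {
        printf("0x%02x\n", pack_int4x2(-2, 3));  // -2 = 0xe, 3 = 0x3 -> 0x3e
    }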
diff --git a/dnn/src/cuda/pooling/pooling2d_qint.cu b/dnn/src/cuda/pooling/pooling2d_qint.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5a648951fb0857c22ce499797e7ef7d034e432b2
--- /dev/null
+++ b/dnn/src/cuda/pooling/pooling2d_qint.cu
@@ -0,0 +1,540 @@
+/**
+ * \file dnn/src/cuda/pooling/pooling2d_qint.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "./pooling2d_qint.cuh"
+#include "src/common/opr_param_defs_enumv.cuh"
+#include "src/cuda/query_blocksize.cuh"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace pooling2d;
+
+namespace {
+__device__ __forceinline__ int pack_int8_to_int8x4(int8_t x, int8_t y, int8_t z,
+                                                   int8_t w) {
+    int ix = static_cast<int>(x), iy = static_cast<int>(y),
+        iz = static_cast<int>(z), iw = static_cast<int>(w);
+
+    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(ix) : "r"(iy));
+    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(iz) : "r"(iw));
+    asm volatile("prmt.b32 %0, %0, %1, 0x5410;" : "+r"(ix) : "r"(iz));
+    return ix;
+}
+
+template <int regs, typename dt, typename OutDtype>
+__device__ __forceinline__ OutDtype pack_int8(int8_t (&x)[regs]);
+
+template <>
+__device__ __forceinline__ int pack_int8<4, int8_t, int>(int8_t (&x)[4]) {
+    return pack_int8_to_int8x4(x[0], x[1], x[2], x[3]);
+}
+
+template <>
+__device__ __forceinline__ int2 pack_int8<8, int8_t, int2>(int8_t (&x)[8]) {
+    int8_t x0[4]{x[0], x[1], x[2], x[3]};
+    int8_t x1[4]{x[4], x[5], x[6], x[7]};
+    return ::make_int2(pack_int8<4, int8_t, int>(x0),
+                       pack_int8<4, int8_t, int>(x1));
+}
+
+template <>
+__device__ __forceinline__ int4 pack_int8<16, int8_t, int4>(int8_t (&x)[16]) {
+    int8_t x0[4]{x[0], x[1], x[2], x[3]};
+    int8_t x1[4]{x[4], x[5], x[6], x[7]};
+    int8_t x2[4]{x[8], x[9], x[10], x[11]};
+    int8_t x3[4]{x[12], x[13], x[14], x[15]};
+    return ::make_int4(
+            pack_int8<4, int8_t, int>(x0), pack_int8<4, int8_t, int>(x1),
+            pack_int8<4, int8_t, int>(x2), pack_int8<4, int8_t, int>(x3));
+}
+
+__device__ __forceinline__ int8_t pack_int8_to_int4x2(int8_t x0, int8_t x1) {
+    return (x0 & 0xf) | (x1 << 4);
+}
+template <>
+__device__ __forceinline__ int pack_int8<8, dt_qint4, int>(int8_t (&x)[8]) {
+    int8_t x0 = pack_int8_to_int4x2(x[0], x[1]);
+    int8_t x1 = pack_int8_to_int4x2(x[2], x[3]);
+    int8_t x2 = pack_int8_to_int4x2(x[4], x[5]);
+    int8_t x3 = pack_int8_to_int4x2(x[6], x[7]);
+    return pack_int8_to_int8x4(x0, x1, x2, x3);
+}
+template <>
+__device__ __forceinline__ int4 pack_int8<32, dt_qint4, int4>(int8_t (&x)[32]) {
+    int8_t x0[8]{x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]};
+    int8_t x1[8]{x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]};
+    int8_t x2[8]{x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23]};
+    int8_t x3[8]{x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
+    return ::make_int4(
+            pack_int8<8, dt_qint4, int>(x0), pack_int8<8, dt_qint4, int>(x1),
+            pack_int8<8, dt_qint4, int>(x2), pack_int8<8, dt_qint4, int>(x3));
+}
+
+template <typename dt>
+struct TypeTrait;
+
+template <>
+struct TypeTrait<int8_t> {
+    static constexpr int bit_width = 8;
+    static constexpr int mask = 0xff;
+    static constexpr int8_t min = -128;
+    static constexpr int elem_per_32bit = 32 / bit_width;
+    static constexpr int shift_fix_sign = 0;
+};
+
+template <>
+struct TypeTrait<dt_qint4> {
+    static constexpr int bit_width = 4;
+    static constexpr int mask = 0xf;
+    static constexpr int8_t min = -8;
+    static constexpr int elem_per_32bit = 32 / bit_width;
+    static constexpr int shift_fix_sign = 4;
+};
+
+template <typename src_type, typename _feed_type>
+struct MaxPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+    int8_t res[nr_results];
+
+    __device__ MaxPooler(int) {}
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = TypeTrait<src_type>::min;
+        }
+    }
+    __device__ __forceinline__ void feed(int x, int idx = 0) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] = res[idx + i] > temp ? res[idx + i] : temp;
+        }
+    }
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        ans = pack_int8<nr_results, src_type, feed_type>(res);
+        return ans;
+    }
+};
+
+template <typename src_type, typename _feed_type>
+struct MeanIncludeRoundedPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+
+    int32_t res[nr_results];
+    const int count;
+    const float fi_count;
+
+    __device__ MeanIncludeRoundedPooler(int count)
+            : count{count}, fi_count{1.f / count} {}
+
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = 0;
+        }
+    }
+    __device__ __forceinline__ void feed(int x, int idx = 0) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] += static_cast<int32_t>(temp);
+        }
+    }
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        int8_t out_res[nr_results];
+#pragma unroll
+        for (int i = 0; i < nr_results; i++) {
+            float f32_res = roundf(static_cast<float>(res[i]) * fi_count);
+            int i8_res;
+            asm volatile("cvt.rni.s8.f32 %0, %1;"
+                         : "=r"(i8_res)
+                         : "f"(f32_res));
+            out_res[i] = i8_res;
+        }
+        ans = pack_int8<nr_results, src_type, feed_type>(out_res);
+        return ans;
+    }
+};
+
+template <typename src_type, typename _feed_type>
+struct MeanExcludeRoundedPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+    int32_t res[nr_results];
+    int count;
+    __device__ MeanExcludeRoundedPooler(int) {}
+
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = 0;
+        }
+        count = 0;
+    }
+    __device__ __forceinline__ void feed(int x, int idx) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] += static_cast<int32_t>(temp);
+        }
+    }
+    __device__ __forceinline__ void feed(int x) {
+        feed(x, 0);
+        count++;
+    }
+
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        count++;
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+        count++;
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        int8_t out_res[nr_results];
+#pragma unroll
+        for (int i = 0; i < nr_results; i++) {
+            float f32_res = roundf(static_cast<float>(res[i]) / count);
+            int i8_res;
+            asm volatile("cvt.rni.s8.f32 %0, %1;"
+                         : "=r"(i8_res)
+                         : "f"(f32_res));
+            out_res[i] = i8_res;
+        }
+        ans = pack_int8<nr_results, src_type, feed_type>(out_res);
+        return ans;
+    }
+};
+
+template <typename Pooler>
+__global__ void pooling2d_device_template_int8_cdiv4hwn4(
+        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
+    const int tidx = threadIdx.x;
+    const int tidy = threadIdx.y;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+    const int bidz = blockIdx.z;
+
+    using ldg_type = typename Pooler::feed_type;
+    static int constexpr pack_size = 4;
+    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
+    const int batch = (bidy * blockDim.x + tidx) * ldg_width;
+    const int packed_ch = bidz * blockDim.y + tidy;
+    const int npack = param.n * pack_size;
+    if (batch >= param.n || packed_ch >= param.c / pack_size)
+        return;
+
+    const int ho = bidx / param.wo;
+    const int wo = bidx - param.wo * ho;
+    const int input_pixels = param.hi * param.wi;
+    const int output_pixels = param.ho * param.wo;
+    const int8_t* __restrict__ g_src_ptr =
+            src + batch * pack_size + packed_ch * input_pixels * npack;
+    int8_t* __restrict__ g_dst_ptr = dst + batch * pack_size +
+                                     packed_ch * output_pixels * npack +
+                                     (ho * param.wo + wo) * npack;
+
+    Pooler pooler(param.window_h * param.window_w);
+    pooler.init();
+    for (int fh = 0; fh < param.window_h; fh++) {
+        uint32_t ih = ho * param.sh + fh - param.ph;
+        for (int fw = 0; fw < param.window_w; fw++) {
+            uint32_t iw = wo * param.sw + fw - param.pw;
+            if (ih < param.hi && iw < param.wi) {
+                const int8_t* __restrict__ cur_src_ptr =
+                        g_src_ptr + (ih * param.wi + iw) * npack;
+                ldg_type sval =
+                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
+                pooler.feed(sval);
+            }
+        }
+    }
+    ldg_type res = pooler.get_ans();
+    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
+}
+
+template <typename Pooler, int pack_size, int pack_byte,
+          int ldg_width_assert = 4>
+__global__ void pooling2d_device_template_nchwc(const int8_t* __restrict__ src,
+                                                int8_t* __restrict__ dst,
+                                                Param param) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    using ldg_type = typename Pooler::feed_type;
+    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
+    static int constexpr ldg_width_bytes = sizeof(ldg_type);
+    static int constexpr section = pack_byte / sizeof(ldg_type);
+    MEGDNN_STATIC_ASSERT(
+            ldg_width == ldg_width_assert,
+            "pooling2d (NCHW64) kernel must use 128bit width ldg instruction");
+    const int c_packed = param.c / pack_size;
+    const int batch = tid / (param.ho * param.wo * c_packed * section);
+    const int batch_residual =
+            tid - batch * param.ho * param.wo * c_packed * section;
+    const int oc = batch_residual / (param.ho * param.wo * section);
+    const int oc_residual = batch_residual - oc * param.ho * param.wo * section;
+    const int oh = oc_residual / (param.wo * section);
+    const int oh_residual = (oc_residual - oh * param.wo * section);
+    const int ow = oh_residual / section;
+    const int sec = oh_residual - ow * section;
+    if (batch >= param.n || oc >= c_packed || oh >= param.ho || ow >= param.wo)
+        return;
+
+    const int in_batch_stride =
+            param.hi * param.wi * param.c * pack_byte / pack_size;
+    const int out_batch_stride =
+            param.ho * param.wo * param.c * pack_byte / pack_size;
+    const int in_channel_stride = param.hi * param.wi * pack_byte;
+    const int out_channel_stride = param.ho * param.wo * pack_byte;
+    const int8_t* __restrict__ g_src_ptr =
+            src + (batch * in_batch_stride + oc * in_channel_stride +
+                   sec * ldg_width_bytes);
+    int8_t* __restrict__ g_dst_ptr =
+            dst + (batch * out_batch_stride + oc * out_channel_stride +
+                   (oh * param.wo + ow) * pack_byte + sec * ldg_width_bytes);
+
+    Pooler pooler(param.window_h * param.window_w);
+    pooler.init();
+    for (int fh = 0; fh < param.window_h; fh++) {
+        uint32_t ih = oh * param.sh + fh - param.ph;
+        for (int fw = 0; fw < param.window_w; fw++) {
+            uint32_t iw = ow * param.sw + fw - param.pw;
+            if (ih < param.hi && iw < param.wi) {
+                const int8_t* __restrict__ cur_src_ptr =
+                        g_src_ptr + (ih * param.wi + iw) * pack_byte;
+                ldg_type sval =
+                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
+                pooler.feed(sval);
+            }
+        }
+    }
+    ldg_type res = pooler.get_ans();
+    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
+}
+
+};  // namespace
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src,
+                                                          int8_t* d_dst,
+                                                          const Param& param,
+                                                          cudaStream_t stream,
+                                                          uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    uint32_t vthreads_x = 0, vthreads_y = param.c / 4;
+#define dispatch_pooling_mode(_feed_type)                          \
+    switch (mode) {                                                \
+        case Mode::MAX:                                            \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MaxPooler<int8_t, _feed_type>>;                \
+            break;                                                 \
+        case Mode::AVERAGE:                                        \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MeanIncludeRoundedPooler<int8_t, _feed_type>>; \
+            break;                                                 \
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:                  \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MeanExcludeRoundedPooler<int8_t, _feed_type>>; \
+            break;                                                 \
+        default:                                                   \
+            megdnn_assert(false, "invalid pooling mode");          \
+    }
+    if (param.n % 4 == 0) {
+        dispatch_pooling_mode(int4);
+        vthreads_x = param.n / 4;
+    } else if (param.n % 2 == 0) {
+        dispatch_pooling_mode(int2);
+        vthreads_x = param.n / 2;
+    } else {
+        dispatch_pooling_mode(int32_t);
+        vthreads_x = param.n;
+    }
+#undef dispatch_pooling_mode
+    constexpr uint32_t threads_x = 16;
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    uint32_t nr_threads_x = std::min(threads_x, vthreads_x),
+             nr_threads_y = std::min(nr_threads / nr_threads_x, vthreads_y);
+    uint32_t nr_blocks_x = param.ho * param.wo,
+             nr_blocks_y = DIVUP(vthreads_x, nr_threads_x),
+             nr_blocks_z = DIVUP(vthreads_y, nr_threads_y);
+    dim3 threads{nr_threads_x, nr_threads_y, 1};
+    dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z};
+    kern<<<blocks, threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src,
+                                                          int8_t* d_dst,
+                                                          const Param& param,
+                                                          cudaStream_t stream,
+                                                          uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 4;
+    constexpr int elem_per_byte = 1;
+    constexpr int pack_size = 4;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    constexpr int ldg_assert_width = ldg_byte / sizeof(int32_t);
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<int8_t, int32_t>,
+                                                   pack_size, pack_byte,
+                                                   ldg_assert_width>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<int8_t, int32_t>,
+                    pack_size, pack_byte, ldg_assert_width>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<int8_t, int32_t>,
+                    pack_size, pack_byte, ldg_assert_width>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src,
+                                                            int8_t* d_dst,
+                                                            const Param& param,
+                                                            cudaStream_t stream,
+                                                            uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 16;
+    constexpr int elem_per_byte = 1;
+    constexpr int pack_size = 32;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<int8_t, int4>,
+                                                   pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<int8_t, int4>, pack_size,
+                    pack_byte>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<int8_t, int4>, pack_size,
+                    pack_byte>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int4_ncdiv64hw64(const int8_t* d_src,
+                                                            int8_t* d_dst,
+                                                            const Param& param,
+                                                            cudaStream_t stream,
+                                                            uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 16;
+    constexpr int elem_per_byte = 2;
+    constexpr int pack_size = 64;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<dt_qint4, int4>,
+                                                   pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<dt_qint4, int4>,
+                    pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<dt_qint4, int4>,
+                    pack_size, pack_byte>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+// vim: syntax=cuda.doxygen
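A detail worth calling out in the new poolers above: feed() extracts each lane with a mask and then sign-extends it by shifting left and back right by shift_fix_sign (4 for int4, 0 for int8). A host-side illustration of the same trick, assuming an arithmetic right shift on int8_t as on the CUDA targets:

    #include <cstdint>
    #include <cstdio>

    int main() {
        int x = int(0x9ABCDEF0u);  // eight packed int4 lanes
        for (int i = 0; i < 8; ++i) {
            int8_t t = int8_t(((x >> (i * 4)) & 0xf) << 4);  // shift_fix_sign = 4
            t = t >> 4;  // arithmetic shift restores the sign
            printf("%d ", t);  // prints: 0 -1 -2 -3 -4 -5 -6 -7
        }
        printf("\n");
    }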
diff --git a/dnn/src/cuda/pooling/pooling2d_int8.cuh b/dnn/src/cuda/pooling/pooling2d_qint.cuh
similarity index 83%
rename from dnn/src/cuda/pooling/pooling2d_int8.cuh
rename to dnn/src/cuda/pooling/pooling2d_qint.cuh
index b5554ba8f3f10bc6aa8d6766ca6094890e89c7ec..ed24e526e2c817193cdac320e0d0a1dad10b1f7a 100644
--- a/dnn/src/cuda/pooling/pooling2d_int8.cuh
+++ b/dnn/src/cuda/pooling/pooling2d_qint.cuh
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/cuda/pooling/pooling2d_int8.cuh
+ * \file dnn/src/cuda/pooling/pooling2d_qint.cuh
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -32,6 +32,11 @@ void do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src, int8_t* d_dst,
 void do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src, int8_t* d_dst,
                                    const Param& param, cudaStream_t stream,
                                    uint32_t mode);
+
+void do_pooling2d_int4_ncdiv64hw64(const int8_t* d_src, int8_t* d_dst,
+                                   const Param& param, cudaStream_t stream,
+                                   uint32_t mode);
+
 }  // namespace pooling2d
 }  // namespace cuda
 }  // namespace megdnn
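For the new do_pooling2d_int4_ncdiv64hw64 entry point declared above, each thread issues one 16-byte load and, at 2 elements per byte, reduces 32 int4 values, so the launch sizes itself accordingly. The arithmetic, using the constants from the .cu file:

    // ldg_byte = 16, elem_per_byte = 2  =>  elem_per_thread = 32
    // vthreads = param.n * param.c * param.ho * param.wo / 32
    // e.g. n = 64, c = 512, ho = wo = 14: 64 * 512 * 14 * 14 / 32 = 200704 threads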
diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp
index ed978826507c32b1270f5c77c7991499ed9a338b..902780c41bd627c690a72ade5635574a51e234d5 100644
--- a/dnn/src/naive/pooling/opr_impl.cpp
+++ b/dnn/src/naive/pooling/opr_impl.cpp
@@ -15,6 +15,7 @@
 #include "megdnn/dtype.h"
 #include "src/common/utils.h"
 #include "src/naive/handle.h"
+#include "src/naive/lowbit_utils.h"
 
 #include "midout.h"
 MIDOUT_DECL(megdnn_naive_pooling)
@@ -190,6 +191,12 @@ struct NCHW32IdxGetter {
         return (((n * (C >> 5) + (c >> 5)) * H + h) * W + w) * 32 + (c & 0x1f);
     }
 };
+struct NCHW64IdxGetter {
+    static size_t get_idx(size_t n, size_t c, size_t h, size_t w, size_t,
+                          size_t C, size_t H, size_t W) {
+        return (((n * (C >> 6) + (c >> 6)) * H + h) * W + w) * 64 + (c & 0x3f);
+    }
+};
 /*!
  * Pooler for AVERAGE_COUNT_EXCLUDE_PADDING mode
  */
@@ -375,15 +382,81 @@ void pooling_backward_max_impl(const ctype* __restrict src,
 
 namespace megdnn {
 namespace naive {
 
+WorkspaceBundle PoolingForwardImpl::get_workspace_bundle(
+        void* ptr, const TensorLayout& src, const TensorLayout& dst) const {
+    SmallVector<size_t> sizes;
+    TensorLayout fsrc = src;
+    TensorLayout fdst = dst;
+    auto get_workspace = [&sizes](TensorLayout& layout) {
+        if (layout.dtype.enumv() == DTypeEnum::Quantized4Asymm ||
+            layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+            layout.dtype = dtype::Int8();
+            layout.format = TensorLayout::Format(layout.dtype);
+            sizes.push_back(layout.span().dist_byte());
+        }
+    };
+    get_workspace(fsrc);
+    get_workspace(fdst);
+    return {ptr, std::move(sizes)};
+};
+
+size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
+                                                  const TensorLayout& dst) {
+    return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes();
+}
+
+namespace {
+
+void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
+                  WorkspaceBundle& workspace_bundle) {
+    if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+        int8_to_int4(comp_dst, dst);
+    } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        uint8_to_uint4(comp_dst, dst);
+    }
+}
+
+}  // namespace
+
 void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
                               _megdnn_workspace workspace) {
     check_exec(src.layout, dst.layout, workspace.size);
+    TensorND comp_src = src;
+    TensorND comp_dst = dst;
+
+    auto wsb = get_workspace_bundle(workspace.raw_ptr, src.layout, dst.layout);
+    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+        float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
+        comp_src.layout.dtype = dtype::QuantizedS8(scale);
+        comp_src.layout.init_contiguous_stride();
+        comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.raw_ptr = wsb.get(0);
+        comp_dst.layout.dtype = dtype::QuantizedS8(scale);
+        comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
+        comp_dst.layout.init_contiguous_stride();
+        comp_dst.raw_ptr = wsb.get(1);
+        int4_to_int8(src, comp_src);
+    } else if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        float scale = src.layout.dtype.param<dtype::Quantized4Asymm>().scale;
+        uint8_t zero_point =
+                src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
+        comp_src.layout.dtype = dtype::Quantized8Asymm(scale, zero_point);
+        comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.layout.init_contiguous_stride();
+        comp_src.raw_ptr = wsb.get(0);
+        comp_dst.layout.dtype = dtype::Quantized8Asymm(scale, zero_point);
+        comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
+        comp_dst.layout.init_contiguous_stride();
+        comp_dst.raw_ptr = wsb.get(1);
+        uint4_to_uint8(src, comp_src);
+    }
+
     size_t c_pos, spatial_pos, batch_pos = 0;
     if (param().format == Param::Format::NCHW ||
         param().format == Param::Format::NCHW4 ||
         param().format == Param::Format::NCHW88 ||
         param().format == Param::Format::NCHW44 ||
-        param().format == Param::Format::NCHW32) {
+        param().format == Param::Format::NCHW32 ||
+        param().format == Param::Format::NCHW64) {
         c_pos = 1;
         spatial_pos = 2;
     } else if (param().format == Param::Format::NHWC) {
@@ -398,27 +471,35 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         c_pos = 2;
         spatial_pos = 1;
     }
-    size_t N = src.layout.shape[batch_pos], C = src.layout.shape[c_pos],
-           IH = src.layout.shape[spatial_pos + 0],
-           IW = src.layout.shape[spatial_pos + 1];
-    size_t OH = dst.layout.shape[spatial_pos + 0],
-           OW = dst.layout.shape[spatial_pos + 1];
-    if (param().format == Param::Format::NHWCD4) {
-        C *= 4;
-        IW = src.layout.shape[spatial_pos + 2];
-        OW = dst.layout.shape[spatial_pos + 2];
-    }
-    if (param().format == Param::Format::NCHW4 ||
-        param().format == Param::Format::NCHW44 ||
-        param().format == Param::Format::CHWN4) {
-        C *= 4;
-    }
-    if (param().format == Param::Format::NCHW88) {
-        C *= 8;
-    }
-    if (param().format == Param::Format::NCHW32) {
-        C *= 32;
+    size_t N = comp_src.layout.shape[batch_pos],
+           C = comp_src.layout.shape[c_pos],
+           IH = comp_src.layout.shape[spatial_pos + 0],
+           IW = comp_src.layout.shape[spatial_pos + 1];
+    size_t OH = comp_dst.layout.shape[spatial_pos + 0],
+           OW = comp_dst.layout.shape[spatial_pos + 1];
+    switch (param().format) {
+        case Param::Format::NHWCD4:
+            C *= 4;
+            IW = comp_src.layout.shape[spatial_pos + 2];
+            OW = comp_dst.layout.shape[spatial_pos + 2];
+            break;
+        case Param::Format::NCHW4:
+        case Param::Format::NCHW44:
+        case Param::Format::CHWN4:
+            C *= 4;
+            break;
+        case Param::Format::NCHW88:
+            C *= 8;
+            break;
+        case Param::Format::NCHW32:
+            C *= 32;
+            break;
+        case Param::Format::NCHW64:
+            C *= 64;
+            break;
+        default:;
     }
+
     size_t PH = param().pad_h, PW = param().pad_w;
     size_t FH = param().window_h, FW = param().window_w;
     size_t SH = param().stride_h, SW = param().stride_w;
@@ -427,8 +508,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         MEGDNN_DISPATCH_CPU_KERN(                                           \
                 static_cast<naive::HandleImpl*>(handle()),                  \
                 pooling_forward_impl<Pooler MEGDNN_COMMA IdxGetter>(        \
-                        sptr, dptr, src.layout.dtype, N, C, IH, IW, OH, OW, \
-                        PH, PW, SH, SW, FH, FW));                           \
+                        sptr, dptr, comp_src.layout.dtype, N, C, IH, IW, OH, \
+                        OW, PH, PW, SH, SW, FH, FW));                       \
     }                                                                       \
     MIDOUT_END();
 
@@ -455,6 +536,9 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         case Param::Format::NCHW32:                                       \
             DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, NCHW32IdxGetter); \
             break;                                                        \
+        case Param::Format::NCHW64:                                       \
+            DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, NCHW64IdxGetter); \
+            break;                                                        \
         case Param::Format::CHWN4:                                        \
             DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, CHWN4IdxGetter);  \
             break;                                                        \
@@ -462,30 +546,35 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
             megdnn_throw("invalid pooling format");                       \
     }
 
-#define cb(DType)                                                     \
-    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {       \
-        using ctype = typename DTypeTrait<DType>::ctype;              \
-        switch (param().mode) {                                       \
-            case Mode::MAX: {                                         \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MaxPooler);                      \
-                return;                                               \
-            }                                                         \
-            case Mode::AVERAGE: {                                     \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MeanIncludePooler);              \
-                return;                                               \
-            }                                                         \
-            case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: {               \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MeanExcludePooler);              \
-                return;                                               \
-            }                                                         \
-        }                                                             \
+#define cb(DType)                                                     \
+    if (comp_src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {  \
+        using ctype = typename DTypeTrait<DType>::ctype;              \
+        switch (param().mode) {                                       \
+            case Mode::MAX: {                                         \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MaxPooler);                      \
+                break;                                                \
+            }                                                         \
+            case Mode::AVERAGE: {                                     \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MeanIncludePooler);              \
+                break;                                                \
+            }                                                         \
+            case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: {               \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MeanExcludePooler);              \
+                break;                                                \
+            }                                                         \
+            default:                                                  \
+                megdnn_assert(0, "not supported mode");               \
+        }                                                             \
+        post_process(dst, comp_dst, handle(), wsb);                   \
+        return;                                                       \
     }
+
 MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
 MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
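The naive backend sidesteps 4-bit arithmetic entirely: exec() widens int4/uint4 input into an 8-bit workspace tensor via int4_to_int8 / uint4_to_uint8 from lowbit_utils, runs the ordinary 8-bit pooling kernels on comp_src/comp_dst, and post_process() narrows the result back. Schematically:

    // QuantizedS4 src --int4_to_int8--> comp_src (QuantizedS8, wsb slot 0)
    // pooling_forward_impl(comp_src, comp_dst)   comp_dst (wsb slot 1)
    // comp_dst --int8_to_int4--> dst (QuantizedS4)

The quantization scale is carried over unchanged, so the widened values are the same integers bit for bit and the round trip introduces no re-quantization error.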
diff --git a/dnn/src/naive/pooling/opr_impl.h b/dnn/src/naive/pooling/opr_impl.h
index ed56fb646b7ea1b9dc51d9c904a40ac70ba74f8a..1590785195dff32d3a82432a2eaa1a3af07e80ed 100644
--- a/dnn/src/naive/pooling/opr_impl.h
+++ b/dnn/src/naive/pooling/opr_impl.h
@@ -20,10 +20,12 @@ class PoolingForwardImpl: public PoolingForward {
     using PoolingForward::PoolingForward;
     void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
               _megdnn_workspace workspace) override;
-    size_t get_workspace_in_bytes(const TensorLayout &,
-                                  const TensorLayout &) override {
-        return 0;
-    }
+    size_t get_workspace_in_bytes(const TensorLayout&,
+                                  const TensorLayout&) override;
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
+                                         const TensorLayout&) const;
 };
 
 class PoolingBackwardImpl : public PoolingBackward {
diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h
index a175b3671d39d4d1bc256ff64ff7e845f240745c..e727cf622d67df313096b395055dc0f59b0fd32b 100644
--- a/dnn/test/common/checker.h
+++ b/dnn/test/common/checker.h
@@ -414,34 +414,34 @@ TensorND TensorValue(const TensorShape& shape, T dtype,
 
 template <typename T, typename U>
 TensorND TensorValueLowbit4(const TensorShape& shape, T dtype,
-                            std::vector<U> values) {
+                            std::vector<U> values) {
     TensorND tensor;
     tensor.layout = {shape, dtype};
     tensor.raw_ptr =
             static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
     megdnn_assert(values.size() == tensor.layout.total_nr_elems());
     auto ptr = tensor.ptr<typename DTypeTrait<T>::ctype>();
-    size_t i;
-    for (i = 0; i + 1 < values.size(); i += 2) {
-        U val0 = values[i], val1 = values[i + 1];
-        megdnn_assert(val0 >= DTypeTrait<T>::min());
-        megdnn_assert(val1 <= DTypeTrait<T>::max());
-        ptr[i / 2] = typename DTypeTrait<T>::ctype((val0 & 0xF) | (val1 << 4));
-    }
-    if (i < values.size()) {
-        U val0 = values[i];
-        megdnn_assert(val0 >= DTypeTrait<T>::min() &&
-                      val0 <= DTypeTrait<T>::max());
-        if (i + 1 < values.size()) {
-            U val1 = values[i + 1];
-            megdnn_assert(val1 >= DTypeTrait<T>::min() &&
-                          val1 <= DTypeTrait<T>::max());
-            ptr[i / 2] = typename DTypeTrait<T>::ctype((val0 & 0xF) | (val1 << 4));
-        } else {
-            ptr[i / 2] = typename DTypeTrait<T>::ctype(val0 & 0xF);
+    auto layout = tensor.layout;
+    auto dim_in = shape[layout.ndim - 1];
+    auto elems = tensor.layout.total_nr_elems();
+    auto dim_out = elems / dim_in;
+    auto stride_out = div_ceil(dim_in, 2_z);
+    size_t in_offset = 0;
+    for (size_t i = 0; i < dim_out; ++i) {
+        for (size_t j = 0; j < dim_in; j += 2) {
+            U a = values[in_offset + j];
+            U b = 0;
+            if (j + 1 < dim_in)
+                b = values[in_offset + j + 1];
+            megdnn_assert(a >= DTypeTrait<T>::min());
+            megdnn_assert(a <= DTypeTrait<T>::max());
+            megdnn_assert(b >= DTypeTrait<T>::min());
+            megdnn_assert(b <= DTypeTrait<T>::max());
+            ptr[j / 2] = (a & 0xF) | (b << 4);
         }
+        in_offset += dim_in;
+        ptr += stride_out;
     }
-
     return tensor;
 }
diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp
index 8fcf9f5abe51f0f3f700be6f998b0252c88d3bcd..a98b8028f51a1b2d9c25281f989b8767ff14ee51 100644
--- a/dnn/test/cuda/pooling.cpp
+++ b/dnn/test/cuda/pooling.cpp
@@ -242,6 +242,20 @@ TEST_F(CUDA, POOLING_BACKWARD)
                 .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
     }
 }
+TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
+    require_compute_capability(7, 5);
+    using Param = param::Pooling;
+    Checker<Pooling> checker(handle_cuda());
+    Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
+    checker.set_dtype(0, dtype::QuantizedS4(0.1f));
+    param.format = Param::Format::NCHW;
+    checker.set_epsilon(1 + 1e-3);
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+}
 
 TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
     require_compute_capability(7, 5);
@@ -252,6 +266,10 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
     param.format = Param::Format::NCHW4;
     checker.set_epsilon(1 + 1e-3);
     checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
 }
 
 #if CUDNN_VERSION >= 7500
@@ -267,9 +285,29 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
     param.format = Param::Format::NCHW32;
     checker.set_epsilon(1e-3).set_rng(0, &int_rng);
     checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
 }
 #endif
 
+TEST_F(CUDA, POOLING_FORWARD_NCHW64) {
+    require_compute_capability(7, 5);
+    using Param = param::Pooling;
+    Checker<Pooling> checker(handle_cuda());
+    Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
+    UniformIntRNG int_rng{-8, 7};
+    checker.set_dtype(0, dtype::QuantizedS4(1.f));
+    param.format = Param::Format::NCHW64;
+    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+}
+
 TEST_F(CUDA, POOLING_FORWARD_CHWN4) {
     require_compute_capability(6, 1);
     using Param = param::Pooling;
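The TensorValueLowbit4 rewrite in checker.h matters for the odd-width inputs in the naive tests below: 4-bit tensors are now packed two elements per byte per innermost row, and an odd row length leaves a half-byte of padding so the next row starts byte-aligned. For a 3x3 input, each row of 3 values occupies ceil(3/2) = 2 bytes:

    // row {1, 2, 3} -> bytes 0x21, 0x03   (low nibble first; pad nibble is 0)
    // 3x3 tensor    -> 3 rows * 2 bytes = 6 bytes, not ceil(9/2) = 5

The old code packed the flat value stream with no per-row alignment, which disagrees with how the low-bit kernels index rows.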
diff --git a/dnn/test/naive/pooling.cpp b/dnn/test/naive/pooling.cpp
index 803b9c16b71e0146acdae2f4ab067bbbdefd6150..e8208ab901c66cc19bd54656dfc5d8a22246b32f 100644
--- a/dnn/test/naive/pooling.cpp
+++ b/dnn/test/naive/pooling.cpp
@@ -50,4 +50,63 @@ TEST_F(NAIVE, POOLING_QUANTIZED) {
                                12306, 23333})});
 }
 
+TEST_F(NAIVE, POOLING_QUANTIZED_Q4) {
+    using Mode = Pooling::Param::Mode;
+
+    Checker<Pooling> checker(handle(), /* check_dispatch */ false);
+
+    {
+        auto q4_dt = dtype::QuantizedS4(1.f);
+        std::vector<int8_t> i8_src_vec{1, 2,  3,
+                                       4, 5,  6,
+                                       7, -1, -2};
+        std::vector<int8_t> i8_max_dst_vec{1, 3, 7, 6};
+        std::vector<int8_t> i8_avg_dst_vec{0, 1, 3, 2};
+        std::vector<int8_t> i8_avg_exclu_dst_vec{1, 3, 6, 2};
+        Pooling::Param param{Mode::MAX, 1, 1, 2, 2, 2, 2};
+        Testcase input{TensorValueLowbit4({1, 1, 3, 3}, q4_dt, i8_src_vec), {}};
+
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_max_dst_vec)});
+        param = {Mode::AVERAGE, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_avg_dst_vec)});
+        param = {Mode::AVERAGE_COUNT_EXCLUDE_PADDING, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_avg_exclu_dst_vec)});
+    }
+
+    {
+        auto u4_dt = dtype::Quantized4Asymm(1.f, 0);
+        std::vector<uint8_t> u8_src_vec{1, 2, 3,
+                                        4, 5, 6,
+                                        7, 8, 9};
+        std::vector<uint8_t> u8_max_dst_vec{1, 3, 7, 9};
+        std::vector<uint8_t> u8_avg_dst_vec{0, 1, 3, 7};
+        std::vector<uint8_t> u8_avg_exclu_dst_vec{1, 3, 6, 7};
+        Pooling::Param param{Mode::MAX, 1, 1, 2, 2, 2, 2};
+        Testcase input{TensorValueLowbit4({1, 1, 3, 3}, u4_dt, u8_src_vec), {}};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_max_dst_vec)});
+        param = {Mode::AVERAGE, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_avg_dst_vec)});
+        param = {Mode::AVERAGE_COUNT_EXCLUDE_PADDING, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_avg_exclu_dst_vec)});
+    }
+}
+
 // vim: syntax=cpp.doxygen
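As a sanity check on the expected vectors in the signed test case: with pad 1, window 2, stride 2 on the 3x3 input, the bottom-right window covers {5, 6, -1, -2}, so MAX gives 6, AVERAGE gives round((5 + 6 - 1 - 2) / 4) = 2, and AVERAGE_COUNT_EXCLUDE_PADDING also divides by 4 since that window contains no padded taps, giving 2; these are the last elements of i8_max_dst_vec, i8_avg_dst_vec, and i8_avg_exclu_dst_vec. The top-left window covers only {1} plus three padded taps, so the include-padding average is round(1/4) = 0 while the exclude-padding average is 1.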