From 8da2f698a37dd9a6dfc09efb7c190cd468a4fffa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 12 Apr 2021 19:48:33 +0800 Subject: [PATCH] feat(dnn/cuda): support warp perspective/pooling op when channel not aligned to 64 GitOrigin-RevId: 39f29ec990ebb4c61e8d217911976b51810df34c --- dnn/src/common/tensor_format.cpp | 2 +- dnn/src/cuda/elemwise_helper.cuh | 5 +- dnn/src/cuda/elemwise_multi_type/kern_ops.cuh | 49 ++-- dnn/src/cuda/integer_subbyte_utils.cuh | 146 ++++++++++ dnn/src/cuda/memory_utils.cuh | 253 ++++++++++++++++++ dnn/src/cuda/relayout/opr_impl.cpp | 8 +- dnn/src/cuda/relayout/param_visitor.cuh | 28 +- dnn/src/cuda/relayout_format/helper.cuh | 103 +++++++ .../cuda/relayout_format/relayout_format.cu | 140 +++++----- dnn/src/cuda/type_cvt/kern.cu | 10 +- dnn/src/cuda/utils.cuh | 99 ------- dnn/src/cuda/warp_perspective/forward.cpp | 1 + dnn/src/cuda/warp_perspective/forward.cu | 2 + dnn/test/cuda/pooling.cpp | 7 +- dnn/test/cuda/type_cvt.cpp | 2 +- dnn/test/cuda/warp_perspective.cpp | 4 +- src/gopt/impl/tensor_reformat.cpp | 24 +- src/gopt/test/inference.cpp | 189 ++++++------- src/opr/impl/imgproc.cpp | 3 +- src/opr/test/dnn/convolution.cpp | 31 ++- src/plugin/impl/opr_footprint.cpp | 20 +- test/src/helper.cpp | 32 ++- test/src/include/megbrain/test/helper.h | 9 + 23 files changed, 812 insertions(+), 355 deletions(-) create mode 100644 dnn/src/cuda/integer_subbyte_utils.cuh create mode 100644 dnn/src/cuda/memory_utils.cuh diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp index 4e5a1c0d4..2ed854cdf 100644 --- a/dnn/src/common/tensor_format.cpp +++ b/dnn/src/common/tensor_format.cpp @@ -568,7 +568,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec( res.stride[0] = 1; return res; } - if (res.shape[i] == 1) { + if (res.shape[i] == 1 && res.stride[i] != 1) { res.remove_axis_inplace(i); } } diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index 3b62312b0..5424389bc 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -16,6 +16,7 @@ #include "src/cuda/int_fastdiv.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/utils.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" /* * please note that all arithmetics on GPU are 32-bit for best performance; this @@ -633,7 +634,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); dt_qint4 result(item); @@ -664,7 +665,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); dt_quint4 result(item); diff --git a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh index 658f12d89..9386f4c70 100644 --- a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh +++ b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh @@ -15,6 +15,7 @@ #include "src/cuda/elemwise_helper_q4.cuh" #include "src/cuda/elemwise_multi_type/kern.cuh" #include "src/cuda/utils.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" namespace megdnn { namespace cuda { @@ -380,10 +381,10 @@ struct QuantizedMultiTypeOp< } __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { - dst_storage x = apply( - src_storage(unpack_integer_4bits(a.x, 0))); - 
dst_storage y = apply( - src_storage(unpack_integer_4bits(a.x, 4))); + dst_storage x = apply(src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0))); + dst_storage y = apply(src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4))); *(dst_vect_type*)(&dst[idx]) = elemwise_intl::VectTypeTrait::make_vector(x, y); @@ -470,14 +471,14 @@ struct QuantizedMultiTypeOp< __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, src_vect_type b) { - src_storage a_x = - src_storage(unpack_integer_4bits(a.x, 0)); - src_storage a_y = - src_storage(unpack_integer_4bits(a.x, 4)); - src_storage b_x = - src_storage(unpack_integer_4bits(b.x, 0)); - src_storage b_y = - src_storage(unpack_integer_4bits(b.x, 4)); + src_storage a_x = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0)); + src_storage a_y = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4)); + src_storage b_x = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 0)); + src_storage b_y = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 4)); dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); @@ -572,18 +573,18 @@ struct QuantizedMultiTypeOp< __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, src_vect_type b, src_vect_type c) { - src_storage a_x = - src_storage(unpack_integer_4bits(a.x, 0)); - src_storage a_y = - src_storage(unpack_integer_4bits(a.x, 4)); - src_storage b_x = - src_storage(unpack_integer_4bits(b.x, 0)); - src_storage b_y = - src_storage(unpack_integer_4bits(b.x, 4)); - src_storage c_x = - src_storage(unpack_integer_4bits(c.x, 0)); - src_storage c_y = - src_storage(unpack_integer_4bits(c.x, 4)); + src_storage a_x = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 0)); + src_storage a_y = src_storage( + integer_subbyte::unpack_integer_4bits(a.x, 4)); + src_storage b_x = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 0)); + src_storage b_y = src_storage( + integer_subbyte::unpack_integer_4bits(b.x, 4)); + src_storage c_x = src_storage( + integer_subbyte::unpack_integer_4bits(c.x, 0)); + src_storage c_y = src_storage( + integer_subbyte::unpack_integer_4bits(c.x, 4)); dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); diff --git a/dnn/src/cuda/integer_subbyte_utils.cuh b/dnn/src/cuda/integer_subbyte_utils.cuh new file mode 100644 index 000000000..d9a80ac8a --- /dev/null +++ b/dnn/src/cuda/integer_subbyte_utils.cuh @@ -0,0 +1,146 @@ +/** + * \file dnn/src/cuda/integer_subbyte_utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#if MEGDNN_CC_CUDA +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace integer_subbyte { +template +struct integer_trait; + +template <> +struct integer_trait { + using type = int; +}; + +template <> +struct integer_trait { + using type = unsigned; +}; + +MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( + int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { + unsigned out; +#if __CUDA_ARCH__ >= 750 && \ + ((__CUDACC_VER_MAJOR__ > 10) || \ + ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) + asm volatile( + "{ .reg .u32 r4;" + "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" + "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" + "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" + "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" + "}" + : "=r"(out) + : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), + "r"(s7)); +#else +#define CVT_SAT_S4_S32(r, bits) \ + r = r <= -8 ? -8 : r; \ + r = r > 7 ? 7 : r; \ + r = (((unsigned)r & 0xf) << bits); + CVT_SAT_S4_S32(s0, 0) + CVT_SAT_S4_S32(s1, 4) + CVT_SAT_S4_S32(s2, 8) + CVT_SAT_S4_S32(s3, 12) + CVT_SAT_S4_S32(s4, 16) + CVT_SAT_S4_S32(s5, 20) + CVT_SAT_S4_S32(s6, 24) + CVT_SAT_S4_S32(s7, 28) + out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; +#undef CVT_SAT_S4_S32 +#endif + return reinterpret_cast(out); +} + +MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( + int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { + unsigned out; +#if __CUDA_ARCH__ >= 750 && \ + ((__CUDACC_VER_MAJOR__ > 10) || \ + ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) + asm volatile( + "{ .reg .u32 r4;" + "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" + "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" + "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" + "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" + "}" + : "=r"(out) + : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), + "r"(s7)); +#else +#define CVT_SAT_U4_S32(r, bits) \ + r = r <= 0 ? 0 : r; \ + r = r > 15 ? 15 : r; \ + r = (((unsigned)r & 0xf) << bits); + CVT_SAT_U4_S32(s0, 0) + CVT_SAT_U4_S32(s1, 4) + CVT_SAT_U4_S32(s2, 8) + CVT_SAT_U4_S32(s3, 12) + CVT_SAT_U4_S32(s4, 16) + CVT_SAT_U4_S32(s5, 20) + CVT_SAT_U4_S32(s6, 24) + CVT_SAT_U4_S32(s7, 28) + out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; +#undef CVT_SAT_U4_S32 +#endif + return reinterpret_cast(out); +} + +template +MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, + int bits) { + //! 
size in bits of 32 bit integer - 4 bits + static constexpr int shift = 28; + using type = typename integer_trait::type; + unsigned intermediate = static_cast(storage); + type result = reinterpret_cast(intermediate); + return (result << (shift - bits)) >> shift; +} + +MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( + int (&result)[8], const int& source) { +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = unpack_integer_4bits( + reinterpret_cast(source), (i << 2)); + } +} + +MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( + int (&result)[8], const int& source) { +#pragma unroll + for (int i = 0; i < 8; i++) { + result[i] = unpack_integer_4bits( + reinterpret_cast(source), (i << 2)); + } +} + +MEGDNN_DEVICE __forceinline__ static void transform_int4x2_to_int8( + int (&result)[2], const uint8_t& source) { + result[0] = unpack_integer_4bits(source, 0); + result[1] = unpack_integer_4bits(source, 4); +} + +MEGDNN_DEVICE __forceinline__ static void transform_uint4x2_to_int8( + int (&result)[2], const uint8_t& source) { + result[0] = unpack_integer_4bits(source, 0); + result[1] = unpack_integer_4bits(source, 4); +} +} // namespace integer_subbyte +} // namespace cuda +} // namespace megdnn +#endif +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/memory_utils.cuh b/dnn/src/cuda/memory_utils.cuh new file mode 100644 index 000000000..82c718ed3 --- /dev/null +++ b/dnn/src/cuda/memory_utils.cuh @@ -0,0 +1,253 @@ +/** + * \file dnn/src/cuda/memory_utils.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ +#if MEGDNN_CC_CUDA +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace memory { + +///////////////////////////////////////////////////////////////////////////////////////////////// +template +struct global_load; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// The redundant mov PTX instruction is used to enforce the compiler to +// initialize data to zero before ld.global +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val = 0) { + uint4* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %10;\n" + " mov.b32 %2, %10;\n" + " mov.b32 %3, %10;\n" + " mov.b32 %4, %10;\n" + " mov.b32 %5, %10;\n" + " mov.b32 %6, %10;\n" + " mov.b32 %7, %10;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.v4.u32 {%4, %5, %6, %7}, [%11];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), + "=r"(data[0].w), "=r"(data[1].x), "=r"(data[1].y), + "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val)), + "l"(((uint8_t*)ptr) + 16)); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + uint4& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %6;\n" + " mov.b32 %2, %6;\n" + " mov.b32 %3, %6;\n" + " @p ld.global.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + uint2& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %4;\n" + " @p ld.global.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + unsigned& data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), + "r"(reinterpret_cast(val))); + } +}; + +template +struct global_load { + MEGDNN_DEVICE __forceinline__ global_load(AccessType& D, void const* ptr, + bool pred_guard, int val) { + if (pred_guard) + D = *(reinterpret_cast(ptr)); + else { + unsigned uv = reinterpret_cast(val); + uint8_t& data = reinterpret_cast(D); + data = uv & 0xff; + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes> +struct global_store; + 
+///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint4 const* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), + "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint4 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), + "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint2 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint32_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + uint16_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + MEGDNN_DEVICE __forceinline__ global_store(AccessType const& D, void* ptr, + bool pred_guard) { + if (pred_guard) + *(reinterpret_cast(ptr)) = D; + } +}; + +} // namespace memory +} // namespace cuda +} // namespace megdnn +#endif + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp index 64da13e7d..a55da9348 100644 --- a/dnn/src/cuda/relayout/opr_impl.cpp +++ b/dnn/src/cuda/relayout/opr_impl.cpp @@ -83,12 +83,7 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { return false; if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) return false; - size_t copy_size; - if (ldst.dtype.is_low_bit()) { - copy_size = ldst.access_bytes(); - } else { - copy_size = ldst.total_nr_elems() * dtype_size(); - } + size_t copy_size = ldst.span().dist_byte(); cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size, cudaMemcpyDeviceToDevice, m_opr->stream())); @@ -191,7 +186,6 @@ bool RelayoutForwardImpl::Param::try_copy_last_contig() { } void RelayoutForwardImpl::Param::copy_general() { - copy_noncontig_general(m_dst, m_src, m_opr->stream()); } diff --git a/dnn/src/cuda/relayout/param_visitor.cuh 
b/dnn/src/cuda/relayout/param_visitor.cuh index a44fb1b4d..c33a27757 100644 --- a/dnn/src/cuda/relayout/param_visitor.cuh +++ b/dnn/src/cuda/relayout/param_visitor.cuh @@ -6,14 +6,15 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "megdnn/basic_types.h" #include "src/cuda/int_fastdiv.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" #include "src/cuda/utils.cuh" - #pragma once namespace megdnn { @@ -56,13 +57,13 @@ y template class ParamElemVisitor; #define PARAM_ELEM_VISITOR_COMMON_DEV \ - devfunc ctype *ptr() { return m_ptr; } \ - devfunc ctype &at(uint32_t idx) { return m_ptr[offset(idx)]; } + devfunc ctype* ptr() { return m_ptr; } \ + devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } //! specialization for CONTIG_OTHER template class ParamElemVisitor { - ctype *__restrict m_ptr; + ctype* __restrict m_ptr; int m_stride[ndim]; //! m_shape_highdim[i] = original_shape[i + 1] @@ -75,7 +76,7 @@ class ParamElemVisitor { public: static const int NDIM = ndim; - void host_init(const TensorND &rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA devfunc void thread_init(uint32_t) {} @@ -86,7 +87,7 @@ public: int offset = 0; #pragma unroll for (int i = ndim - 1; i >= 1; --i) { - Uint32Fastdiv &shp = m_shape_highdim[i - 1]; + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; uint32_t idx_div = idx / shp; offset += (idx - idx_div * shp.divisor()) * m_stride[i]; idx = idx_div; @@ -102,12 +103,12 @@ public: //! 
specialization for CONTIG_FULL template class ParamElemVisitor { - ctype *__restrict m_ptr; + ctype* __restrict m_ptr; public: static const int NDIM = ndim; - void host_init(const TensorND &rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA devfunc void thread_init(uint32_t) {} @@ -126,7 +127,6 @@ template class ParamElemVisitor { using Storage = uint8_t; -protected: Storage* __restrict m_ptr; int m_stride[ndim]; int m_shape[ndim]; @@ -205,7 +205,6 @@ public: for (int i = 0; i < ndim; ++i) { valid &= (shape_idx[i] < m_shape[i]); } -#pragma unroll for (int i = 0; i < ndim - 1; ++i) { idx = (idx + shape_idx[i]) * m_shape[i + 1]; } @@ -213,7 +212,6 @@ public: } return idx; } - devfunc Storage* ptr() { return m_ptr; } devfunc Storage at(uint32_t idx) { @@ -221,7 +219,7 @@ public: int vec_idx = offset_ >> 1; int lane_idx = offset_ & 0x1; - Storage item = Storage(unpack_integer_4bits( + Storage item = Storage(integer_subbyte::unpack_integer_4bits( *(Storage*)&m_ptr[vec_idx], lane_idx * 4)); return item; @@ -235,7 +233,7 @@ public: #endif }; -} // namespace cuda -} // namespace megdnn +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/relayout_format/helper.cuh b/dnn/src/cuda/relayout_format/helper.cuh index f77c7cc9d..69d98b70f 100644 --- a/dnn/src/cuda/relayout_format/helper.cuh +++ b/dnn/src/cuda/relayout_format/helper.cuh @@ -143,6 +143,109 @@ struct global_load_with_zero_point { } }; +///////////////////////////////////////////////////////////////////////////////////////////////// +template < + /// Fragment type to store loaded data + typename AccessType, + /// The bytes of loading + int LoadBytes> +struct global_store; + +///////////////////////////////////////////////////////////////////////////////////////////////// +// +// Specializations +// +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint4 const* data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + " @p st.global.v4.u32 [%6], {%7, %8, %9, %10};\n" + "}\n" + : + : "l"(ptr), "r"(data[0].x), "r"(data[0].y), "r"(data[0].z), + "r"(data[0].w), "r"((int)pred_guard), + "l"(((uint8_t*)ptr) + 16), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint4 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " @p st.global.v4.u32 [%0], {%1, %2, %3, %4};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w), + "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint2 const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " @p st.global.v2.u32 [%0], {%1, %2};\n" + "}\n" + : + : "l"(ptr), "r"(data.x), "r"(data.y), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint32_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u32 [%0], %1;\n" + 
"}\n" + : + : "l"(ptr), "r"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + uint16_t const& data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " @p st.global.u16 [%0], %1;\n" + "}\n" + : + : "l"(ptr), "h"(data), "r"((int)pred_guard)); + } +}; + +template +struct global_store { + devfunc global_store(AccessType const& D, void* ptr, bool pred_guard) { + if (pred_guard) + *(reinterpret_cast(ptr)) = D; + } +}; + #undef devfunc } // namespace relayout_format } // namespace cuda diff --git a/dnn/src/cuda/relayout_format/relayout_format.cu b/dnn/src/cuda/relayout_format/relayout_format.cu index c28d5ffcf..9c7b7b9f7 100644 --- a/dnn/src/cuda/relayout_format/relayout_format.cu +++ b/dnn/src/cuda/relayout_format/relayout_format.cu @@ -10,17 +10,14 @@ * implied. */ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-parameter" -#pragma GCC diagnostic ignored "-Wstrict-aliasing" -#include "cutlass/fast_math.h" -#include "cutlass/arch/memory.h" -#pragma GCC diagnostic pop +#include "src/cuda/int_fastdiv.cuh" #include "src/cuda/query_blocksize.cuh" #include "src/cuda/relayout_format/relayout_format.cuh" -#include "src/cuda/relayout_format/helper.cuh" +#include "src/cuda/integer_subbyte_utils.cuh" +#include "src/cuda/memory_utils.cuh" using namespace megdnn; using namespace cuda; +using namespace integer_subbyte; namespace { @@ -322,26 +319,34 @@ struct Translayout<2, 64, SrcType, dtype::QuantizedS4, dtype::QuantizedS4, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); - // clang-format off - unpack_int4x2(0) - unpack_int4x2(1) - unpack_int4x2(2) - unpack_int4x2(3) - unpack_int4x2(4) - unpack_int4x2(5) - unpack_int4x2(6) - unpack_int4x2(7) - // clang-format on + transform_int4x2_to_int8( + intermediate[0], + reinterpret_cast(read_channel[i + 0])); + transform_int4x2_to_int8( + intermediate[1], + reinterpret_cast(read_channel[i + 1])); + transform_int4x2_to_int8( + intermediate[2], + reinterpret_cast(read_channel[i + 2])); + transform_int4x2_to_int8( + intermediate[3], + reinterpret_cast(read_channel[i + 3])); + transform_int4x2_to_int8( + intermediate[4], + reinterpret_cast(read_channel[i + 4])); + transform_int4x2_to_int8( + intermediate[5], + reinterpret_cast(read_channel[i + 5])); + transform_int4x2_to_int8( + intermediate[6], + reinterpret_cast(read_channel[i + 6])); + transform_int4x2_to_int8( + intermediate[7], + reinterpret_cast(read_channel[i + 7])); int frag_idx = i / 8; dst_frag[0 * 8 + frag_idx] = pack_channel(0); dst_frag[1 * 8 + frag_idx] = pack_channel(1); -#undef unpack_int4x2 } } using Fragment = array_wrapper; @@ -429,26 +434,34 @@ struct Translayout<2, 64, SrcType, dtype::Quantized4Asymm, int* dst_frag = reinterpret_cast(dst_width); #pragma unroll for (int i = 0; i < 64; i += 8) { -#define unpack_int4x2(_idx) \ - intermediate[_idx][0] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 0); \ - intermediate[_idx][1] = unpack_integer_4bits( \ - reinterpret_cast(read_channel[i + _idx]), 4); - // clang-format off - unpack_int4x2(0) - unpack_int4x2(1) - unpack_int4x2(2) - unpack_int4x2(3) - unpack_int4x2(4) - unpack_int4x2(5) - 
unpack_int4x2(6) - unpack_int4x2(7) - // clang-format on + transform_uint4x2_to_int8( + intermediate[0], + reinterpret_cast(read_channel[i + 0])); + transform_uint4x2_to_int8( + intermediate[1], + reinterpret_cast(read_channel[i + 1])); + transform_uint4x2_to_int8( + intermediate[2], + reinterpret_cast(read_channel[i + 2])); + transform_uint4x2_to_int8( + intermediate[3], + reinterpret_cast(read_channel[i + 3])); + transform_uint4x2_to_int8( + intermediate[4], + reinterpret_cast(read_channel[i + 4])); + transform_uint4x2_to_int8( + intermediate[5], + reinterpret_cast(read_channel[i + 5])); + transform_uint4x2_to_int8( + intermediate[6], + reinterpret_cast(read_channel[i + 6])); + transform_uint4x2_to_int8( + intermediate[7], + reinterpret_cast(read_channel[i + 7])); int frag_idx = i / 8; dst_frag[0 * 8 + frag_idx] = pack_channel(0); dst_frag[1 * 8 + frag_idx] = pack_channel(1); -#undef unpack_int4x2 } } using Fragment = array_wrapper; @@ -744,6 +757,16 @@ inline __device__ int4 make_zero_pad(const uint8_t zero_point) { return {zero_point, zero_point, zero_point, zero_point}; } +template +inline __device__ int make_zero(int zero_point); + +template <> +inline __device__ int make_zero<4>(int zero_point) { + return transform_int8_to_uint4x8(zero_point, zero_point, zero_point, + zero_point, zero_point, zero_point, + zero_point, zero_point); +} + template inline __device__ void write_helper(DstDtype* ptr, DstDtype val) { *ptr = val; @@ -1062,11 +1085,11 @@ public: using AccessType = array_wrapper; using Fragment = array_wrapper; - MEGDNN_DEVICE TensorIteratorOverChannel() + MEGDNN_HOST TensorIteratorOverChannel() : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} - MEGDNN_DEVICE TensorIteratorOverChannel(Type* pointer_, - int chan_stride_in_elements_, - int channel_, int, int) + MEGDNN_HOST TensorIteratorOverChannel(Type* pointer_, + int chan_stride_in_elements_, + int channel_, int, int) : pointer{pointer_}, chan_stride_in_elements{chan_stride_in_elements_}, channel{channel_} {} @@ -1093,8 +1116,7 @@ public: (lane_size_in_type / pack_size_in_type) + j; bool guard = i < channel; - relayout_format::global_load_with_zero_point( + memory::global_load( frag_ptr[frag_idx], reinterpret_cast(pointer_ + j * pack_size_in_type), @@ -1115,7 +1137,7 @@ public: (lane_size_in_type / pack_size_in_type) + j; bool guard = i < channel; - cutlass::arch::global_store( + memory::global_store( frag_ptr[frag_idx], reinterpret_cast(pointer_ + j * pack_size_in_type), @@ -1160,20 +1182,18 @@ public: using AccessType = array_wrapper; using Fragment = array_wrapper; - MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel() + MEGDNN_HOST MaskedTensorIteratorOverChannel() : pointer{nullptr}, chan_stride_in_elements{0}, channel{0} {} - MEGDNN_HOST MEGDNN_DEVICE MaskedTensorIteratorOverChannel( + MEGDNN_HOST MaskedTensorIteratorOverChannel( Type* pointer_, int chan_stride_in_elements_, int channel_, int bound_, int div_) : pointer{pointer_}, chan_stride_in_elements{chan_stride_in_elements_}, channel{channel_}, bound{bound_}, - div{div_} { - cutlass::find_divisor(mul, shr, div); - } + div{uint32_t(div_)} {} MEGDNN_DEVICE __forceinline__ void initialize(int c_idx, int hw_idx) { pointer += (c_idx / pack_size) * chan_stride_in_elements; @@ -1187,8 +1207,8 @@ public: #pragma unroll for (int j = 0; j < lane_size_in_type / pack_size_in_type; j++) { int offset = hw_idx + j; - int h, w; - cutlass::fast_divmod(h, w, offset, div, mul, shr); + int h = (int)((uint32_t)(offset) / div); + int w = (int)((uint32_t)(offset) % 
div); bool guard = (i < channel) && (w < bound); int index = (i / pack_size) * (lane_size_in_type / pack_size_in_type) + @@ -1219,8 +1239,7 @@ public: int mask_index = (frag_idx >> 5); int mask_shift = (frag_idx & 0x1f); bool guard = (mask[mask_index] & (1 << mask_shift)); - relayout_format::global_load_with_zero_point( + memory::global_load( frag_ptr[frag_idx], reinterpret_cast(pointer_ + stride[j]), guard, zero_point); @@ -1242,7 +1261,7 @@ public: int mask_index = (frag_idx >> 5); int mask_shift = (frag_idx & 0x1f); bool guard = (mask[mask_index] & (1 << mask_shift)); - cutlass::arch::global_store( + memory::global_store( frag_ptr[frag_idx], reinterpret_cast(pointer_ + stride[j]), guard); } @@ -1260,9 +1279,7 @@ private: int chan_stride_in_elements; int channel; int bound; - int div; - uint32_t mul; - uint32_t shr; + Uint32Fastdiv div; uint32_t mask[mask_size]; size_t stride[lane_size_in_type / pack_size_in_type]; }; @@ -1355,8 +1372,7 @@ __global__ void relayout_kern(typename RelayoutProblem_::Param param) { param.dst_iterator.initialize(c_idx, hw_idx); typename SrcIterator::Fragment src_frag; typename DstIterator::Fragment dst_frag; - int zp = relayout_format::make_zero( - param.zero_point); + int zp = make_zero(param.zero_point); param.src_iterator.load(src_frag, zp); RelayoutProblem_::Transpose::trans( reinterpret_cast(dst_frag), @@ -1456,7 +1472,7 @@ void relayout_format::relayout_format_cuda_nchw_nchwx( megdnn_assert(src_layout.dtype.is_low_bit()); int n = src.layout[0]; int ic = src.layout[1]; - int oc = dst.layout[1] * 64; + int oc = dst.layout[1] * pack_oc; int h = src.layout[2]; // align to byte int w = src.layout[3]; diff --git a/dnn/src/cuda/type_cvt/kern.cu b/dnn/src/cuda/type_cvt/kern.cu index 349770aec..46063ddba 100644 --- a/dnn/src/cuda/type_cvt/kern.cu +++ b/dnn/src/cuda/type_cvt/kern.cu @@ -223,10 +223,12 @@ struct TypeCvtOpFromQuantizedToQuantized4bit< } __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type src) { - dst_storage x = apply( - src_storage(unpack_integer_4bits(src.x, 0))); - dst_storage y = apply( - src_storage(unpack_integer_4bits(src.x, 4))); + dst_storage x = apply(src_storage( + integer_subbyte::unpack_integer_4bits(src.x, + 0))); + dst_storage y = apply(src_storage( + integer_subbyte::unpack_integer_4bits(src.x, + 4))); *(dst_vect_type*)(&dest[idx]) = VectTypeTrait::make_vector(x, y); diff --git a/dnn/src/cuda/utils.cuh b/dnn/src/cuda/utils.cuh index 87983f808..5e6244091 100644 --- a/dnn/src/cuda/utils.cuh +++ b/dnn/src/cuda/utils.cuh @@ -21,7 +21,6 @@ #include "cuda.h" #include "src/cuda/cudnn_with_check.h" #include "cutlass/cutlass.h" -#include "cutlass/platform/platform.h" #define cuda_check(_x) \ do { \ @@ -376,104 +375,6 @@ MEGDNN_DEVICE __forceinline__ static float4 operator+(float4 lval, lval.w + rval.w); } -MEGDNN_DEVICE __forceinline__ static int transform_int8_to_int4x8( - int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { - unsigned out; -#if __CUDA_ARCH__ >= 750 && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) - asm volatile( - "{ .reg .u32 r4;" - "cvt.pack.sat.s4.s32.b32 r4, %8, %7, 0;" - "cvt.pack.sat.s4.s32.b32 r4, %6, %5, r4;" - "cvt.pack.sat.s4.s32.b32 r4, %4, %3, r4;" - "cvt.pack.sat.s4.s32.b32 %0, %2, %1, r4;" - "}" - : "=r"(out) - : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), - "r"(s7)); -#else -#define CVT_SAT_S4_S32(r, bits) \ - r = r <= -8 ? -8 : r; \ - r = r > 7 ? 
7 : r; \ - r = (((unsigned)r & 0xf) << bits); - CVT_SAT_S4_S32(s0, 0) - CVT_SAT_S4_S32(s1, 4) - CVT_SAT_S4_S32(s2, 8) - CVT_SAT_S4_S32(s3, 12) - CVT_SAT_S4_S32(s4, 16) - CVT_SAT_S4_S32(s5, 20) - CVT_SAT_S4_S32(s6, 24) - CVT_SAT_S4_S32(s7, 28) - out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; -#undef CVT_SAT_S4_S32 -#endif - return reinterpret_cast(out); -} - -MEGDNN_DEVICE __forceinline__ static int transform_int8_to_uint4x8( - int s0, int s1, int s2, int s3, int s4, int s5, int s6, int s7) { - unsigned out; -#if __CUDA_ARCH__ >= 750 && \ - ((__CUDACC_VER_MAJOR__ > 10) || \ - ((__CUDACC_VER_MAJOR__ >= 10) && (__CUDACC_VER_MINOR__ >= 2))) - asm volatile( - "{ .reg .u32 r4;" - "cvt.pack.sat.u4.s32.b32 r4, %8, %7, 0;" - "cvt.pack.sat.u4.s32.b32 r4, %6, %5, r4;" - "cvt.pack.sat.u4.s32.b32 r4, %4, %3, r4;" - "cvt.pack.sat.u4.s32.b32 %0, %2, %1, r4;" - "}" - : "=r"(out) - : "r"(s0), "r"(s1), "r"(s2), "r"(s3), "r"(s4), "r"(s5), "r"(s6), - "r"(s7)); -#else -#define CVT_SAT_U4_S32(r, bits) \ - r = r <= 0 ? 0 : r; \ - r = r > 15 ? 15 : r; \ - r = (((unsigned)r & 0xf) << bits); - CVT_SAT_U4_S32(s0, 0) - CVT_SAT_U4_S32(s1, 4) - CVT_SAT_U4_S32(s2, 8) - CVT_SAT_U4_S32(s3, 12) - CVT_SAT_U4_S32(s4, 16) - CVT_SAT_U4_S32(s5, 20) - CVT_SAT_U4_S32(s6, 24) - CVT_SAT_U4_S32(s7, 28) - out = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7; -#undef CVT_SAT_U4_S32 -#endif - return reinterpret_cast(out); -} - -template -MEGDNN_DEVICE __forceinline__ static int unpack_integer_4bits(T storage, - int bits) { - static constexpr int shift = 28; - using type = typename cutlass::platform::conditional::type; - unsigned intermediate = static_cast(storage); - type result = reinterpret_cast(intermediate); - return (result << (shift - bits)) >> shift; -} - -MEGDNN_DEVICE __forceinline__ static void transform_int4x8_to_int8( - int (&result)[8], const int& source) { -#pragma unroll - for (int i = 0; i < 8; i++) { - result[i] = unpack_integer_4bits( - reinterpret_cast(source), (i << 2)); - } -} - -MEGDNN_DEVICE __forceinline__ static void transform_uint4x8_to_int8( - int (&result)[8], const int& source) { -#pragma unroll - for (int i = 0; i < 8; i++) { - result[i] = unpack_integer_4bits( - reinterpret_cast(source), (i << 2)); - } -} #endif } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/warp_perspective/forward.cpp b/dnn/src/cuda/warp_perspective/forward.cpp index cffcb07ae..a1ad4f257 100644 --- a/dnn/src/cuda/warp_perspective/forward.cpp +++ b/dnn/src/cuda/warp_perspective/forward.cpp @@ -348,6 +348,7 @@ void WarpPerspectiveForwardImpl::exec(_megdnn_tensor_in ssrc, RelayoutFormat::Param trans_param; trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW; + trans_param.oc = sdst.layout[1]; relayout_opr->param() = trans_param; relayout_opr->exec(dst, sdst, {}); } diff --git a/dnn/src/cuda/warp_perspective/forward.cu b/dnn/src/cuda/warp_perspective/forward.cu index 24b57ef6d..24cded33b 100644 --- a/dnn/src/cuda/warp_perspective/forward.cu +++ b/dnn/src/cuda/warp_perspective/forward.cu @@ -17,10 +17,12 @@ #include "src/common/rounding_converter.cuh" #include "megdnn/dtype.h" #include +#include "src/cuda/integer_subbyte_utils.cuh" using namespace megdnn; using namespace cuda; using namespace warp_perspective; +using namespace integer_subbyte; namespace { diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp index a98b8028f..033be709b 100644 --- a/dnn/test/cuda/pooling.cpp +++ b/dnn/test/cuda/pooling.cpp @@ -247,14 +247,13 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) { using Param = param::Pooling; Checker 
checker(handle_cuda()); Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2}; - checker.set_dtype(0, dtype::QuantizedS4(0.1f)); + checker.set_dtype(0, dtype::QuantizedS4(3.1415926f)); param.format = Param::Format::NCHW; - checker.set_epsilon(1 + 1e-3); checker.set_param(param).exec({{20, 64, 22, 33}, {}}); param.mode = Param::Mode::AVERAGE; - checker.set_param(param).exec({{20, 64, 22, 33}, {}}); + checker.set_param(param).exec({{20, 96, 22, 33}, {}}); param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING; - checker.set_param(param).exec({{20, 64, 22, 33}, {}}); + checker.set_param(param).exec({{20, 24, 22, 33}, {}}); } TEST_F(CUDA, POOLING_FORWARD_NCHW4) { diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 8567c9f75..5bdde41b9 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -107,7 +107,7 @@ TEST_F(CUDA, QUANTIZED_TYPECVT) { } TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { - UniformIntRNG int_rng{0, 8}; + UniformIntRNG int_rng{-8, 8}; Checker checker(handle_cuda()); checker.set_rng(0, &int_rng).set_rng(1, &int_rng); diff --git a/dnn/test/cuda/warp_perspective.cpp b/dnn/test/cuda/warp_perspective.cpp index 2d323a219..dd4627063 100644 --- a/dnn/test/cuda/warp_perspective.cpp +++ b/dnn/test/cuda/warp_perspective.cpp @@ -627,9 +627,9 @@ TEST_F(CUDA, WARP_PERSPECTIVE_FORWARD_QINT4) { Checker checker(handle_cuda()); WarpPerspectiveMatRNG rng; checker.set_rng(1, &rng); - checker.set_dtype(0, dtype::QuantizedS4(0.1f)) + checker.set_dtype(0, dtype::QuantizedS4(1.25f)) .set_dtype(1, dtype::Float32()) - .set_dtype(2, dtype::QuantizedS4(0.1f)); + .set_dtype(2, dtype::QuantizedS4(1.25f)); for (auto bmode : {WarpPerspective::BorderMode::WRAP, WarpPerspective::BorderMode::REFLECT, WarpPerspective::BorderMode::REPLICATE, diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index ef4500e8f..1b8a064e6 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -68,7 +68,7 @@ using namespace gopt; * oprs should not get involved in any actual computing. */ MGB_DEFINE_OPR_CLASS(TensorReformatPass::RelayoutPlaceholder, - cg::SingleCNOperatorNodeBase) // { + cg::SingleCNOperatorNodeBase) // { public: //! relayout type of this opr enum class LayoutType { @@ -124,14 +124,14 @@ public: NCHW4_TO_NCHW64, //! 
VarNode* { + const TensorShape& orig_shape) -> VarNode* { mgb_assert(inp->shape().ndim == 4); + mgb_assert(inp->shape()[0] == orig_shape[0]); + mgb_assert(inp->shape()[2] == orig_shape[2]); + mgb_assert(inp->shape()[3] == orig_shape[3]); + size_t orig_channels = orig_shape[1]; auto x = SymbolVar(inp); auto cv = [&x](int v) { return x.make_scalar(v); }; using AIdx = opr::Subtensor::AxisIndexer; @@ -4108,8 +4111,7 @@ void PaddingChannelPass::apply(OptState& opt) const { bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; if (padding_cur_inp) { - size_t orig_channels = cur_inp->shape()[1]; - inps[i] = extract_subtensor(inps[i], orig_channels); + inps[i] = extract_subtensor(inps[i], cur_inp->shape()); } } return serialization::copy_opr_shallow(*opr, inps, opr->config()); @@ -4133,8 +4135,7 @@ void PaddingChannelPass::apply(OptState& opt) const { auto cur_inp = opr->input(i); bool padding_cur_inp = padding_oprs.count(cur_inp->owner_opr()) > 0; if (padding_cur_inp) { - size_t orig_channels = cur_inp->shape()[1]; - inps[i] = extract_subtensor(inps[i], orig_channels); + inps[i] = extract_subtensor(inps[i], cur_inp->shape()); } } return serialization::copy_opr_shallow(*opr, inps, opr->config()); @@ -4142,6 +4143,8 @@ void PaddingChannelPass::apply(OptState& opt) const { opr_replace_funcs[opr::Reshape::typeinfo()] = replace_nonpadding_oprs; opr_replace_funcs[opr::GetVarShape::typeinfo()] = replace_nonpadding_oprs; opr_replace_funcs[opr::Concat::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::Reduce::typeinfo()] = replace_nonpadding_oprs; + opr_replace_funcs[opr::Subtensor::typeinfo()] = replace_nonpadding_oprs; auto on_opr = [&opt, &rewriter, &opr_replace_funcs, &extract_subtensor](OperatorNodeBase* opr) { @@ -4169,8 +4172,7 @@ void PaddingChannelPass::apply(OptState& opt) const { auto dst = out1[i]; if (opt.graph().endpoint_contain(src) && !src->shape().eq_shape(dst->shape())) { - size_t orig_channels = src->shape()[1]; - dst = extract_subtensor(dst, orig_channels); + dst = extract_subtensor(dst, src->shape()); } rewriter.replace_var(src, dst, nullptr); } diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 2483a2eeb..a875f3ccf 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4183,14 +4183,7 @@ TEST(TestGoptInference, PaddingChannels) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4263,15 +4256,8 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } - + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); + HostTensorGenerator gen; auto graph = ComputingGraph::make(); graph->options().graph_opt_level = 0; @@ -4332,19 +4318,11 @@ TEST(TestGoptInference, ConcatAfterPaddingChannels) { MGB_ASSERT_TENSOR_EQ(t1, t2); } -// FIXME replace cpu with gpu to enable gpu validation TEST(TestGoptInference, PaddingChannelsWithPooling) { 
REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4408,17 +4386,7 @@ TEST(TestGoptInference, PaddingChannelsWithPooling) { // FIXME replace cpu with gpu to enable gpu validation TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { - REQUIRE_GPU(1); auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4488,16 +4456,9 @@ TEST(TestGoptInference, PaddingChannelsWithWarpPerspective) { TEST(TestGoptInference, EnableNCHW64Basic) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4517,8 +4478,8 @@ TEST(TestGoptInference, EnableNCHW64Basic) { }; auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {32, 4, 3, 3}, dtype::QuantizedS8(2.5f)), - b = mkcvar("b", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); + w = mkcvar("w", {16, 4, 3, 3}, dtype::QuantizedS8(2.5f)), + b = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); opr::ConvBias::Param param; param.format = opr::ConvBias::Param::Format::NCHW; param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; @@ -4527,7 +4488,7 @@ TEST(TestGoptInference, EnableNCHW64Basic) { auto y = opr::ConvBias::make(x, w, b, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - auto w1 = mkcvar("w1", {32, 32, 3, 3}, dtype::QuantizedS8(2.5f)), + auto w1 = mkcvar("w1", {32, 16, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); @@ -4541,14 +4502,14 @@ TEST(TestGoptInference, EnableNCHW64Basic) { auto y3 = opr::ConvBias::make(y2, w3, b3, param, {}, OperatorNodeConfig{dtype::QuantizedS4{40.f}}); y3 = opr::TypeCvt::make(y3, dtype::QuantizedS8{2.5f}); - auto w4 = mkcvar("w4", {32, 64, 3, 3}, dtype::QuantizedS8(2.5f)), - b4 = mkcvar("b4", {1, 32, 1, 1}, dtype::QuantizedS32(6.25f)); + auto w4 = mkcvar("w4", {16, 64, 3, 3}, dtype::QuantizedS8(2.5f)), + b4 = mkcvar("b4", {1, 16, 1, 1}, dtype::QuantizedS32(6.25f)); auto y4 = opr::ConvBias::make(y3, w4, b4, param, {}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); using ElemMultiMode = opr::ElemwiseMultiType::Param::Mode; auto y5 = opr::ElemwiseMultiType::make( {y, y4}, {ElemMultiMode::QFUSE_ADD_RELU}, - OperatorNodeConfig{dtype::QuantizedS8{1.2f}}); + OperatorNodeConfig{dtype::QuantizedS8{1.3f}}); y5 = opr::TypeCvt::make(y5, dtype::Float32()); SymbolVar y5_pad; 
unpack_vector( @@ -4573,10 +4534,10 @@ TEST(TestGoptInference, EnableNCHW64Basic) { ASSERT_EQ(o.param().format, Format::_fmt); \ } CHECK(0, NCHW4); - CHECK(1, NCHW32); + CHECK(1, NCHW4); CHECK(2, NCHW32); CHECK(3, NCHW64); - CHECK(4, NCHW32); + CHECK(4, NCHW4); #undef CHECK HostTensorND t1, t2; auto func1 = graph->compile({make_callback_copy(y5, t1)}); @@ -4588,16 +4549,9 @@ TEST(TestGoptInference, EnableNCHW64Basic) { TEST(TestGoptInference, EnableNCHW64PaddingChannel) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4616,8 +4570,8 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { dtype); }; - auto x = mkvar("x", {16, 3, 14, 14}, dtype::QuantizedS8(2.5f)), - w = mkcvar("w", {20, 3, 3, 3}, dtype::QuantizedS8(2.5f)), + auto x = mkvar("x", {16, 4, 14, 14}, dtype::QuantizedS8(2.5f)), + w = mkcvar("w", {20, 4, 3, 3}, dtype::QuantizedS8(2.5f)), b = mkcvar("b", {1, 20, 1, 1}, dtype::QuantizedS32(6.25f)); opr::ConvBias::Param param; param.format = opr::ConvBias::Param::Format::NCHW; @@ -4630,7 +4584,7 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { opr::Pooling::Param pool; pool.format = opr::Pooling::Param::Format::NCHW; y = opr::Pooling::make(y, pool); - + auto w1 = mkcvar("w1", {24, 20, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 24, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, @@ -4657,11 +4611,12 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { deconv.format = opr::ConvolutionBackwardData::Param::Format::NCHW; deconv.stride_h = deconv.stride_w = 2; deconv.pad_h = deconv.pad_w = 1; - auto w6 = mkcvar("w6", {20, 20, 4, 4}, dtype::QuantizedS8{2.5f}); + auto w6 = mkcvar("w6", {20, 64, 4, 4}, dtype::QuantizedS8{2.5f}); auto y6 = opr::ConvolutionBackwardData::make( w6, y5, deconv, {}, OperatorNodeConfig{dtype::QuantizedS8(2.0f)}); - + y6 = opr::TypeCvt::make(y6, dtype::QuantizedS4{32.f}); + std::shared_ptr mat = std::make_shared( cn, TensorShape{16, 3, 3}, dtype::Float32()); warp_perspective_mat_gen(*mat, 16, 14, 14); @@ -4676,25 +4631,31 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { opt.enable_nchw64(); unpack_vector(gopt::optimize_for_inference({y7}, opt), y7_pad); EXPECT_TRUE(y7.node()->shape().eq_shape(y7_pad.node()->shape())); - SmallVector oprs; - auto cb = [&oprs](cg::OperatorNodeBase* opr) { - if (opr->same_type()) { - oprs.push_back(opr); - } - }; - cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); - ASSERT_EQ(oprs.size(), 5); + HostTensorND t1, t2; + auto func1 = graph->compile({make_callback_copy(y7, t1)}); + func1->execute(); + auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); + func2->execute(); + MGB_ASSERT_TENSOR_EQ(t1, t2); using Format = opr::ConvBiasForward::Param::Format; + SmallVector oprs; + auto cb = [&oprs](cg::OperatorNodeBase* opr) { + if (opr->same_type()) { + oprs.push_back(opr); + } + }; + cg::DepOprIter{cb}.add(y7_pad.node()->owner_opr()); + ASSERT_EQ(oprs.size(), 5); #define CHECK(_i, _fmt) \ { \ const auto& o = oprs[_i]->cast_final(); \ ASSERT_EQ(o.param().format, Format::_fmt); \ } - CHECK(0, NCHW4); - 
CHECK(1, NCHW32); - CHECK(2, NCHW32); - CHECK(3, NCHW64); - CHECK(4, NCHW64); + CHECK(0, NCHW4); + CHECK(1, NCHW32); + CHECK(2, NCHW32); + CHECK(3, NCHW64); + CHECK(4, NCHW64); #undef CHECK { const auto& deconv = find_opr(y7_pad); @@ -4702,30 +4663,19 @@ TEST(TestGoptInference, EnableNCHW64PaddingChannel) { const auto& pool = find_opr(y7_pad); ASSERT_EQ(pool.param().format, Format::NCHW4); const auto& warp = find_opr(y7_pad); - ASSERT_EQ(warp.param().format, Format::NCHW4); + ASSERT_EQ(warp.param().format, Format::NCHW64); } size_t nr_dimshuffle = find_opr_num(y7_pad); - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y7, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y7_pad, t2)}); - func2->execute(); - MGB_ASSERT_TENSOR_EQ(t1, t2); + ASSERT_EQ(nr_dimshuffle, 8); } TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { REQUIRE_GPU(1); - auto cn = CompNode::load("cpu0"); -// cn.activate(); -// auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; -// auto sm_ver = prop.major * 10 + prop.minor; -// if (sm_ver < 61) { -// printf("This testcast ignored due to insufficient cuda cap(got: %d, " -// "expected: %d)\n", -// sm_ver, 61); -// return; -// } + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); + HostTensorND t1, t2; HostTensorGenerator gen; auto graph = ComputingGraph::make(); graph->options().graph_opt_level = 0; @@ -4757,7 +4707,7 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { auto w1 = mkcvar("w1", {64, 32, 3, 3}, dtype::QuantizedS8(2.5f)), b1 = mkcvar("b1", {1, 64, 1, 1}, dtype::QuantizedS32(6.25f)); auto y1 = opr::ConvBias::make(y, w1, b1, param, {}, - OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); auto w2 = mkcvar("w2", {64, 64, 3, 3}, dtype::QuantizedS4(2.5f)), b2 = mkcvar("b2", {1, 64, 1, 1}, dtype::QuantizedS32(100.f)); @@ -4772,6 +4722,9 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { {y1, y3}, {ElemMultiMode::QFUSE_ADD_RELU}, OperatorNodeConfig{dtype::QuantizedS4{40.f}}); y4 = opr::TypeCvt::make(y4, dtype::Float32()); + auto y5 = opr::ConvBias::make(y2, w3, b3, y1, param, {}, + OperatorNodeConfig{dtype::QuantizedS4(40.f)}); + y5 = opr::TypeCvt::make(y5, dtype::Float32()); SymbolVar y4_pad; auto opt = gopt::OptimizeForInferenceOptions{}; opt.enable_nchw64(); @@ -4779,15 +4732,35 @@ TEST(TestGoptInference, EnableNCHW64FuseConvBiasZ) { EXPECT_TRUE(y4.node()->shape().eq_shape(y4_pad.node()->shape())); size_t nr_elem_mult_type = find_opr_num(y4_pad); ASSERT_EQ(nr_elem_mult_type, 0); - // FIXME need impl of elemwise/elemwise_multi_type on CUDA -#if 0 - HostTensorND t1, t2; - auto func1 = graph->compile({make_callback_copy(y4, t1)}); - func1->execute(); - auto func2 = graph->compile({make_callback_copy(y4_pad, t2)}); - func2->execute(); + auto func = graph->compile({make_callback_copy(y4_pad, t1)}); + func->execute(); + + { + opr::ConvBias::Param param; + param.format = opr::ConvBias::Param::Format::NCHW; + param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; + param.stride_h = param.stride_w = 1; + param.pad_h = param.pad_w = 1; + + auto y = opr::ConvBias::make( + x, w, b, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + auto y1 = opr::ConvBias::make( + y, w1, b1, param, {}, + OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); + y1 = opr::TypeCvt::make(y1, dtype::QuantizedS4{40.f}); + auto y2 = opr::ConvBias::make( + y1, w2, b2, 
param, {}, + OperatorNodeConfig{dtype::QuantizedS4{40.f}}); + param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; + auto y3 = opr::ConvBias::make( + y2, w3, b3, y1, param, {}, + OperatorNodeConfig{dtype::QuantizedS4(40.f)}); + y3 = opr::TypeCvt::make(y3, dtype::Float32()); + auto func = graph->compile({make_callback_copy(y3, t2)}); + func->execute(); + } MGB_ASSERT_TENSOR_EQ(t1, t2); -#endif } #endif diff --git a/src/opr/impl/imgproc.cpp b/src/opr/impl/imgproc.cpp index 1d424b407..eedc3e537 100644 --- a/src/opr/impl/imgproc.cpp +++ b/src/opr/impl/imgproc.cpp @@ -102,7 +102,8 @@ void WarpPerspectiveForward::outshape_by_symvar_do_get_output_shape( default: size_t height_idx = 0; if (param().format == Param::Format::NCHW || - param().format == Param::Format::NCHW4) { + param().format == Param::Format::NCHW4 || + param().format == Param::Format::NCHW64) { height_idx = 2; } else { height_idx = 1; diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 12e6b182f..b30aab099 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -2604,11 +2604,21 @@ TEST_F(TestNoWeightPreprocess, NoPreprocess) { #endif namespace { -// FIXME change comp node from "cpu0" to "gpu0" TEST(TestOprDNN, ConvBiasInt4NCHW) { - auto run = [](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, - size_t P) { - auto cn = CompNode::load("cpu0"); + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver != 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + + auto run = [&cn](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, + size_t P) { auto graph = ComputingGraph::make(); HostTensorGenerator gen; @@ -2671,6 +2681,18 @@ TEST(TestOprDNN, ConvBiasInt4NCHW) { } TEST(TestOprDNN, ConvBiasInt4NCHW64) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; + auto sm_ver = prop.major * 10 + prop.minor; + if (sm_ver != 75) { + printf("This testcast ignored due to insufficient cuda cap(got: %d, " + "expected: %d)\n", + sm_ver, 75); + return; + } + auto nchw2nchw64 = [](SymbolVar x) { auto y = opr::RelayoutFormat::make( x, opr::RelayoutFormat::Param::Mode::NCHW_NCHW64); @@ -2685,7 +2707,6 @@ TEST(TestOprDNN, ConvBiasInt4NCHW64) { auto run = [&](size_t N, size_t C, size_t H, size_t W, size_t F, size_t S, size_t P) { - auto cn = CompNode::load("cpu0"); auto graph = ComputingGraph::make(); HostTensorGenerator gen; diff --git a/src/plugin/impl/opr_footprint.cpp b/src/plugin/impl/opr_footprint.cpp index cc29f84be..e5f71e502 100644 --- a/src/plugin/impl/opr_footprint.cpp +++ b/src/plugin/impl/opr_footprint.cpp @@ -138,15 +138,19 @@ uint64_t eval_conv_computation(const TensorShape& src_shape, src_shape[1] / group * 2; return hybird_nchwx ? 
computation : computation * 4; } - if (param.format == Param::Format::NCHW32 || - param.format == Param::Format::NCHW32_NCHW4) { - return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 32 / - group * 2; + size_t packed_size; + if (param.format == Param::Format::NCHW64) { + packed_size = 64; + } else if (param.format == Param::Format::NCHW32 || + param.format == Param::Format::NCHW32_NCHW4) { + packed_size = 32; + } else { + mgb_assert(param.format == Param::Format::NCHW4 || + param.format == Param::Format::NCHW4_NCHW || + param.format == Param::Format::NCHW4_NCHW32, + "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); + packed_size = 4; } - mgb_assert(param.format == Param::Format::NCHW4 || - param.format == Param::Format::NCHW4_NCHW || - param.format == Param::Format::NCHW4_NCHW32, - "format should be NCHW4/NCHW4_NCHW/NCHW4_NCHW32"); return dst_shape.total_nr_elems() * fh * fw * src_shape[1] * 4 / group * 2; }; diff --git a/test/src/helper.cpp b/test/src/helper.cpp index ccdd2bbef..9a2a0f311 100644 --- a/test/src/helper.cpp +++ b/test/src/helper.cpp @@ -390,7 +390,37 @@ bool mgb::check_compute_capability(int major, int minor) { MGB_CUDA_CHECK(cudaGetDevice(&dev)); cudaDeviceProp prop; MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); - return prop.major > major || (prop.major == major && prop.minor >= minor); + bool available = prop.major > major || (prop.major == major && prop.minor >= minor); + if (!available) { + mgb_log_warn( + "This testcase is ignored due to insufficient cuda cap(got: " + "%d.%d, " + "expected: %d.%d)", + prop.major, prop.minor, major, minor); + } + return available; +#else + MGB_MARK_USED_VAR(major); + MGB_MARK_USED_VAR(minor); + return false; +#endif +} + +bool mgb::check_compute_capability_eq(int major, int minor) { +#if MGB_CUDA + int dev; + MGB_CUDA_CHECK(cudaGetDevice(&dev)); + cudaDeviceProp prop; + MGB_CUDA_CHECK(cudaGetDeviceProperties(&prop, dev)); + bool available = prop.major == major && prop.minor == minor; + if (!available) { + mgb_log_warn( + "This testcase is ignored due to insufficient cuda cap(got: " + "%d.%d, " + "expected: %d.%d)", + prop.major, prop.minor, major, minor); + } + return available; #else MGB_MARK_USED_VAR(major); MGB_MARK_USED_VAR(minor); diff --git a/test/src/include/megbrain/test/helper.h b/test/src/include/megbrain/test/helper.h index 5bfee7e59..fc566033c 100644 --- a/test/src/include/megbrain/test/helper.h +++ b/test/src/include/megbrain/test/helper.h @@ -504,6 +504,9 @@ bool check_cambricon_device_available(size_t num); //! check current capability >= major.minor bool check_compute_capability(int major, int minor); +//! check current capability == major.minor +bool check_compute_capability_eq(int major, int minor); + //! check compnode avaiable bool check_device_type_avaiable(CompNode::DeviceType device_type); @@ -540,6 +543,12 @@ public: return; \ } while (0) +#define REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(major, minor) \ + do { \ + if (!check_compute_capability_eq(major, minor)) \ + return; \ + } while (0) + //! skip a testcase if amd gpu not available #define REQUIRE_AMD_GPU(n) do { \ if (!check_amd_gpu_available(n)) \ -- GitLab
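
Below is a minimal host-side sketch (not part of the patch) of the int4x8 packing scheme that the new `integer_subbyte` helpers implement on the non-PTX fallback path: saturate eight 32-bit ints to signed 4-bit nibbles, pack them into one 32-bit word, and sign-extend a nibble back out with a shift pair, as `transform_int8_to_int4x8` and `unpack_integer_4bits` do. The `pack_int4x8` / `unpack_int4` names and the host `main` are illustrative assumptions, not symbols from the patch.

```cpp
#include <cassert>

// Saturate each value to [-8, 7] and pack it into its 4-bit lane,
// mirroring the CVT_SAT_S4_S32 fallback in transform_int8_to_int4x8.
static unsigned pack_int4x8(const int (&s)[8]) {
    unsigned out = 0;
    for (int i = 0; i < 8; ++i) {
        int r = s[i];
        r = r < -8 ? -8 : (r > 7 ? 7 : r);              // signed 4-bit saturation
        out |= (static_cast<unsigned>(r) & 0xfu) << (i * 4);
    }
    return out;
}

// Extract the nibble starting at bit position `bits`, sign-extended,
// using the same (x << (28 - bits)) >> 28 trick as unpack_integer_4bits<true>.
static int unpack_int4(unsigned storage, int bits) {
    int v = static_cast<int>(storage << (28 - bits));   // nibble moved to the top
    return v >> 28;                                      // arithmetic shift sign-extends
}

int main() {
    int src[8]    = {-9, -8, -1, 0, 1, 7, 8, 3};        // -9 and 8 saturate
    int expect[8] = {-8, -8, -1, 0, 1, 7, 7, 3};
    unsigned packed = pack_int4x8(src);
    for (int i = 0; i < 8; ++i)
        assert(unpack_int4(packed, i * 4) == expect[i]); // round-trip per lane
    return 0;
}
```

The same layout is what the uint4 variants use, with the saturation range changed to [0, 15] and no sign extension on unpack.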