Commit 3bf73ff1 authored by Megvii Engine Team

feat(dnn): add cuda preprocess fusion

GitOrigin-RevId: d789c99e59ce713a075061aacf6acdba78af43d3
Parent 86cf7490
......@@ -201,6 +201,8 @@ class dt_quint8 {
#endif
bool operator<(const dt_quint8& b) const { return _ < b._; }
bool operator>(const dt_quint8& b) const { return _ > b._; }
bool operator==(const dt_quint8& b) const { return _ == b._; }
bool operator!=(const dt_quint8& b) const { return _ != b._; }
} MEGDNN_PACKED;
class dt_qint32 {
......@@ -255,6 +257,8 @@ class dt_qint8 {
#endif
bool operator<(const dt_qint8& b) const { return _ < b._; }
bool operator>(const dt_qint8& b) const { return _ > b._; }
bool operator==(const dt_qint8& b) const { return _ == b._; }
bool operator!=(const dt_qint8& b) const { return _ != b._; }
} MEGDNN_PACKED;
class dt_qint16 {
......
......@@ -877,6 +877,7 @@ when the ``I`` suffix is present.
'NCHW88_NCHW',
'NCHW_NCHW4_IC_SMALL',
'NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT',
'NCHW_NCHW4',
)
)
......
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs.h"
......@@ -94,7 +95,9 @@ void RelayoutForward::check_layout_and_canonize(TensorLayout& src,
src = src.collapse_contiguous();
dst = dst.collapse_contiguous();
megdnn_assert(src.dtype == dst.dtype &&
src.total_nr_elems() == dst.total_nr_elems());
src.total_nr_elems() == dst.total_nr_elems(),
"check %s == %s and %zu == %zu", src.dtype.name(),
dst.dtype.name(), src.total_nr_elems(), dst.total_nr_elems());
}
bool relayout::is_transpose(const TensorLayout& src, const TensorLayout& dst,
......
......@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs.h"
......@@ -207,6 +208,15 @@ void RelayoutFormat::deduce_layout_fwd(const TensorLayout& src,
dst[3] = src[2];
dst[4] = src[4];
break;
case Param::Mode::NCHW_NCHW4:
megdnn_assert(src.ndim == 4);
dst.ndim = 5;
dst[0] = src[0];
dst[1] = div_ceil<size_t>(src[1], 4);
dst[2] = src[2];
dst[3] = src[3];
dst[4] = 4;
break;
default:
megdnn_assert(0, "Invalid RelayoutFormat Mode");
break;
......@@ -214,7 +224,9 @@ void RelayoutFormat::deduce_layout_fwd(const TensorLayout& src,
TensorFormat dst_fmt;
deduce_format(src.format, dst_fmt);
dst.format = dst_fmt;
dst.dtype = src.dtype;
if (!dst.dtype.valid()) {
dst.dtype = src.dtype;
}
dst.init_contiguous_stride();
}
......@@ -245,6 +257,10 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
CHECK_SRC(DefaultTensorFormat::make());
dst = src;
break;
case Param::Mode::NCHW_NCHW4:
CHECK_SRC(DefaultTensorFormat::make());
dst = src;
break;
case Param::Mode::NCHW_NHWCD4I:
CHECK_SRC(DefaultTensorFormat::make());
dst = Image2DPack4TensorFormat::make_raw(2, align);
......@@ -322,6 +338,7 @@ void RelayoutFormat::deduce_format(TensorFormat src, TensorFormat& dst) {
void RelayoutFormat::check_layout_fwd(const TensorLayout& src,
const TensorLayout& dst) {
TensorLayout dst_expected;
dst_expected.dtype = dst.dtype;
deduce_layout_fwd(src, dst_expected);
megdnn_assert_eq_layout(dst_expected, dst);
}
......@@ -354,6 +371,19 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src,
exec_dst = dst;
}
break;
case Param::Mode::NCHW_NCHW4:
// nchw to nchw4
{
TensorLayout work_space_layout(
{src[0], round_up(src[1], 4_z), src[2], src[3]},
src.dtype, src.format);
exec_src = work_space_layout
.reshape({src[0], div_ceil(src[1], 4_z), 4,
src[2], src[3]})
.dimshuffle({0, 1, 3, 4, 2});
exec_dst = dst;
}
break;
case Param::Mode::NCHW88_NCHW:
// nchw8c to nchw
exec_src = src;
......@@ -422,7 +452,6 @@ void RelayoutFormat::deduce_exec_layout(const TensorLayout& src,
}
break;
case Param::Mode::NCHW_NHWCD4:
case Param::Mode::NCHW_NHWCD4I:
// src is {N, C, H, W}
......
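For reference, the NCHW_NCHW4 layouts deduced above move element (n, c, h, w) of an (N, C, H, W) tensor to position (n, c / 4, h, w, c % 4) of an (N, ceil(C / 4), H, W, 4) tensor, with the tail channels padded. A minimal host-side sketch of that mapping (illustrative only; the helper name and the int8 element type are assumptions, not part of this commit):

    // Pad channels up to a multiple of 4 with pad_val and repack NCHW -> NCHW4.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int8_t> nchw_to_nchw4(const std::vector<int8_t>& src, size_t N,
                                      size_t C, size_t H, size_t W,
                                      int8_t pad_val = 0) {
        const size_t C4 = (C + 3) / 4;  // number of 4-channel blocks
        std::vector<int8_t> dst(N * C4 * H * W * 4, pad_val);
        for (size_t n = 0; n < N; ++n)
            for (size_t c = 0; c < C; ++c)
                for (size_t h = 0; h < H; ++h)
                    for (size_t w = 0; w < W; ++w) {
                        const size_t s = ((n * C + c) * H + h) * W + w;
                        const size_t d =
                                (((n * C4 + c / 4) * H + h) * W + w) * 4 + c % 4;
                        dst[d] = src[s];
                    }
        return dst;
    }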
......@@ -6,11 +6,13 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/relayout_format/opr_impl.h"
#include "src/cuda/relayout_format/relayout_format.h"
#include "src/cuda/utils.h"
using namespace megdnn;
......@@ -21,6 +23,7 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
auto src_dtype = src.layout.dtype;
megdnn_assert(
param().mode == param::RelayoutFormat::Mode::NCHW4_CHWN4 ||
param().mode == param::RelayoutFormat::Mode::NCHW_NCHW4 ||
param().mode == param::RelayoutFormat::Mode::CHWN4_NCHW4 ||
param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL ||
param().mode ==
......@@ -72,12 +75,25 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
return handle()->create_operator<RelayoutForward>()->exec(
{src.raw_ptr, exec_src_layout}, {dst.raw_ptr, exec_dst_layout});
}
TensorLayout exec_src, exec_dst;
deduce_exec_layout(src.layout, dst.layout, exec_src, exec_dst);
TensorND exec_src_nd{src.raw_ptr, exec_src};
TensorND exec_dst_nd{dst.raw_ptr, exec_dst};
handle()->create_operator<RelayoutForward>()->exec(exec_src_nd,
exec_dst_nd);
if (param().mode == Param::Mode::NCHW_NCHW4) {
bool is_usable = relayout_format::RelayoutFormatFast::usable(
src.layout, dst.layout);
megdnn_assert(is_usable,
"RelayoutFormatNCHW_NCHW4 kernel not usable for %s(%s) "
"to %s(%s)",
src.layout.to_string().c_str(), src.layout.dtype.name(),
dst.layout.to_string().c_str(), dst.layout.dtype.name());
relayout_format::RelayoutFormatFast::exec(src, dst,
cuda_stream(this->handle()));
} else {
TensorLayout exec_src, exec_dst;
deduce_exec_layout(src.layout, dst.layout, exec_src, exec_dst);
TensorND exec_src_nd{src.raw_ptr, exec_src};
TensorND exec_dst_nd{dst.raw_ptr, exec_dst};
handle()->create_operator<RelayoutForward>()->exec(exec_src_nd,
exec_dst_nd);
}
}
size_t RelayoutFormatImpl::get_workspace_in_bytes(
......
/**
* \file dnn/src/cuda/relayout_format/relayout_format.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/relayout_format/relayout_format.cuh"
#include "src/cuda/relayout_format/relayout_format.h"
using namespace megdnn;
using namespace cuda;
namespace {
inline void get_scale_zeropoint(const DType& tensor_dtype, float& scale,
uint8_t& zero_point) {
if (tensor_dtype.enumv() == DTypeEnum::Quantized8Asymm) {
zero_point = tensor_dtype.param<dtype::Quantized8Asymm>().zero_point;
scale = tensor_dtype.param<dtype::Quantized8Asymm>().scale;
} else if (tensor_dtype.enumv() == DTypeEnum::QuantizedS8) {
scale = tensor_dtype.param<dtype::QuantizedS8>().scale;
}
}
} // namespace
bool relayout_format::RelayoutFormatFast::usable(
const TensorLayout& src_layout, const TensorLayout& dst_layout) {
return relayout_format_cuda_usable(src_layout, dst_layout);
}
void relayout_format::RelayoutFormatFast::exec(const TensorND& src,
const TensorND& dst,
cudaStream_t stream) {
size_t ih = src.layout[2];
size_t iw = src.layout[3];
size_t hw = ih * iw;
float src_scale = 1.f;
float dst_scale = 1.f;
uint8_t src_zero_point = 0;
uint8_t dst_zero_point = 0;
get_scale_zeropoint(src.layout.dtype, src_scale, src_zero_point);
get_scale_zeropoint(dst.layout.dtype, dst_scale, dst_zero_point);
if (src.layout.dtype.enumv() == DTypeEnum::Uint8) {
src_zero_point = 128;
}
if (hw % 4 == 0) {
relayout_format_cuda_exec<4>(src, dst, stream, src_scale, dst_scale,
src_zero_point, dst_zero_point);
} else {
relayout_format_cuda_exec<1>(src, dst, stream, src_scale, dst_scale,
src_zero_point, dst_zero_point);
}
}
/**
* \file dnn/src/cuda/relayout_format/relayout_format.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/cuda/query_blocksize.cuh"
#include "src/cuda/relayout_format/relayout_format.cuh"
using namespace megdnn;
using namespace cuda;
namespace {
template <typename SrcType, typename DstType, bool same_scale>
struct CudaPostProcess;
template <>
struct CudaPostProcess<dtype::Uint8, dtype::QuantizedS8, true> {
CudaPostProcess(float, uint8_t, float, uint8_t){};
inline __device__ int8_t operator()(uint8_t val) { return val - 128; }
};
template <>
struct CudaPostProcess<dtype::Uint8, dtype::QuantizedS8, false> {
CudaDTypeParamImpl<dt_qint8> m_dst_type_cvt;
CudaPostProcess(float, uint8_t, float dst_scale, uint8_t) {
m_dst_type_cvt = CudaDTypeParamImpl<dt_qint8>(dst_scale);
};
inline __device__ int8_t operator()(uint8_t val) {
return m_dst_type_cvt.quantize((float)val - 128.f).as_int8();
}
};
template <>
struct CudaPostProcess<dtype::Quantized8Asymm, dtype::QuantizedS8, false> {
CudaDTypeParamImpl<dt_qint8> m_dst_type_cvt;
CudaDTypeParamImpl<dt_quint8> m_src_type_cvt;
CudaPostProcess(float src_scale, uint8_t src_zero_point, float dst_scale,
uint8_t) {
m_dst_type_cvt = CudaDTypeParamImpl<dt_qint8>(dst_scale);
m_src_type_cvt =
CudaDTypeParamImpl<dt_quint8>(src_scale, src_zero_point);
};
inline __device__ int8_t operator()(uint8_t val) {
float med_var = m_src_type_cvt.dequantize(dt_quint8(val));
return m_dst_type_cvt.quantize(med_var).as_int8();
}
};
template <>
struct CudaPostProcess<dtype::Quantized8Asymm, dtype::QuantizedS8, true> {
uint8_t m_src_zero_point = 0;
CudaPostProcess(float, uint8_t src_zero_point, float, uint8_t) {
m_src_zero_point = src_zero_point;
};
inline __device__ int8_t operator()(uint8_t val) {
return val - m_src_zero_point;
}
};
template <>
struct CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, false> {
CudaDTypeParamImpl<dt_qint8> m_dst_type_cvt;
CudaDTypeParamImpl<dt_qint8> m_src_type_cvt;
CudaPostProcess(float src_scale, uint8_t, float dst_scale, uint8_t) {
m_dst_type_cvt = CudaDTypeParamImpl<dt_qint8>(dst_scale);
m_src_type_cvt = CudaDTypeParamImpl<dt_qint8>(src_scale);
};
inline __device__ int8_t operator()(int8_t val) {
float med_var = m_src_type_cvt.dequantize(dt_qint8(val));
return m_dst_type_cvt.quantize(med_var).as_int8();
}
};
template <>
struct CudaPostProcess<dtype::QuantizedS8, dtype::QuantizedS8, true> {
CudaPostProcess(float, uint8_t, float, uint8_t){};
inline __device__ int8_t operator()(int8_t val) { return val; }
};
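// Numeric check of the dequantize/re-quantize path above (made-up parameters,
// not from this commit): with a Quantized8Asymm source of scale 0.5 and
// zero_point 128 and a QuantizedS8 destination of scale 0.25, an input byte of
// 130 dequantizes to (130 - 128) * 0.5 = 1.0 and re-quantizes to
// round(1.0 / 0.25) = 4; the same_scale specializations skip the float round
// trip entirely.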
template <typename SrcType, int pack_w>
struct DTypeRWHelper;
template <>
struct DTypeRWHelper<char, 1> {
using InnerDtype = char;
using DstDtype = char4;
};
template <>
struct DTypeRWHelper<char, 4> {
using InnerDtype = char4;
using DstDtype = char4;
};
template <int pack_w, int pack_c, typename SrcType, typename DnnSrcType,
typename DnnDstType, bool same_scale>
struct Translayout {
using InnerDtype = typename DTypeRWHelper<SrcType, pack_w>::InnerDtype;
using DstDtype = typename DTypeRWHelper<SrcType, pack_w>::DstDtype;
static inline __device__ void trans(DstDtype (&dst_width)[pack_w],
InnerDtype (&read_channel)[pack_c],
const char zero_point);
};
template <typename SrcType, typename DnnSrcType, typename DnnDstType,
bool same_scale>
struct Translayout<1, 4, SrcType, DnnSrcType, DnnDstType, same_scale> {
using InnerDtype = typename DTypeRWHelper<SrcType, 1>::InnerDtype;
using DstDtype = typename DTypeRWHelper<SrcType, 1>::DstDtype;
static inline __device__ void trans(
DstDtype (&dst_width)[1], InnerDtype (&read_channel)[4],
CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process,
const char zero_point) {
dst_width[0].x = post_process(read_channel[0]);
dst_width[0].y = post_process(read_channel[1]);
dst_width[0].z = post_process(read_channel[2]);
dst_width[0].w = post_process(read_channel[3]);
}
};
template <typename SrcType, typename DnnSrcType, typename DnnDstType,
bool same_scale>
struct Translayout<4, 4, SrcType, DnnSrcType, DnnDstType, same_scale> {
using InnerDtype = typename DTypeRWHelper<SrcType, 4>::InnerDtype;
using DstDtype = typename DTypeRWHelper<SrcType, 4>::DstDtype;
static inline __device__ void trans(
DstDtype (&dst_width)[4], InnerDtype (&read_channel)[4],
CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process,
const char zero_point) {
dst_width[0].x = post_process(read_channel[0].x);
dst_width[0].y = post_process(read_channel[1].x);
dst_width[0].z = post_process(read_channel[2].x);
dst_width[0].w = post_process(read_channel[3].x);
dst_width[1].x = post_process(read_channel[0].y);
dst_width[1].y = post_process(read_channel[1].y);
dst_width[1].z = post_process(read_channel[2].y);
dst_width[1].w = post_process(read_channel[3].y);
dst_width[2].x = post_process(read_channel[0].z);
dst_width[2].y = post_process(read_channel[1].z);
dst_width[2].z = post_process(read_channel[2].z);
dst_width[2].w = post_process(read_channel[3].z);
dst_width[3].x = post_process(read_channel[0].w);
dst_width[3].y = post_process(read_channel[1].w);
dst_width[3].z = post_process(read_channel[2].w);
dst_width[3].w = post_process(read_channel[3].w);
}
};
template <typename DstType>
inline __device__ DstType make_zero_pad(const char zero_point) {
return zero_point;
}
template <>
inline __device__ char4 make_zero_pad<char4>(const char zero_point) {
return {zero_point, zero_point, zero_point, zero_point};
}
template <typename DstDtype>
inline __device__ void write_helper(DstDtype* ptr, DstDtype val) {
*ptr = val;
}
template <>
inline __device__ void write_helper<char4>(char4* ptr, char4 val) {
int32_t* rel_ptr = (int32_t*)ptr;
*rel_ptr = *(int32_t*)(&val);
}
template <bool with_pad, int pack_w, int pack_c, bool same_scale,
typename SrcType, typename DstType, typename DnnSrcType,
typename DnnDstType>
struct RelayoutKern {
using InnerDtype = typename DTypeRWHelper<SrcType, pack_w>::InnerDtype;
using DstDtype = typename DTypeRWHelper<SrcType, pack_w>::DstDtype;
static inline __device__ void write(DstType* dst_ptr,
char4 (&dst_width)[pack_w]) {
DstDtype* dst_inner_ptr = (DstDtype*)dst_ptr;
#pragma unroll
for (int iw_idx = 0; iw_idx < pack_w; ++iw_idx) {
write_helper(dst_inner_ptr + iw_idx, dst_width[iw_idx]);
}
}
static inline __device__ void read(const SrcType* src_ptr,
InnerDtype (&read_channel)[pack_c],
const int ic_stride) {
#pragma unroll
for (int ic_idx = 0; ic_idx < pack_c; ++ic_idx) {
read_channel[ic_idx] = *(InnerDtype*)(src_ptr + ic_idx * ic_stride);
}
}
static inline __device__ void read_with_pad(
const SrcType* src_ptr, InnerDtype (&read_channel)[pack_c],
const int ic_stride, const int remain_ic,
const InnerDtype zero_point) {
#pragma unroll
for (int ic_idx = 0; ic_idx < pack_c; ++ic_idx) {
read_channel[ic_idx] =
ic_idx < remain_ic
? *(InnerDtype*)(src_ptr + ic_idx * ic_stride)
: zero_point;
}
}
static inline __device__ void core_relayout_kern(
const SrcType* src, DstType* dst, const int src_offset_base,
const int dst_offset_base, const int ic_offset, const int ic_stride,
const int remain_ic,
CudaPostProcess<DnnSrcType, DnnDstType, same_scale>& post_process,
const char zero_point) {
InnerDtype read_channel[pack_c];
if (with_pad) {
const InnerDtype zero_pad = make_zero_pad<InnerDtype>(zero_point);
read_with_pad(src + ic_offset + src_offset_base, read_channel,
ic_stride, remain_ic, zero_pad);
} else {
read(src + ic_offset + src_offset_base, read_channel, ic_stride);
}
DstDtype dst_width[pack_w];
Translayout<pack_w, pack_c, SrcType, DnnSrcType, DnnDstType,
same_scale>::trans(dst_width, read_channel, post_process,
zero_point);
write(dst + ic_offset + dst_offset_base, dst_width);
}
};
template <int pack_w, bool same_scale, typename SrcType, typename DstType,
typename DnnSrcType, typename DnnDstType>
__global__ void kern_nchw_nchw4(
const SrcType* src, DstType* dst, int ic, int ihw, int n_stride_src,
int ic_stride, int n_stride_dst,
CudaPostProcess<DnnSrcType, DnnDstType, same_scale> post_process,
const char zero_point) {
constexpr int pack_c = 4;
const int n_idx = blockIdx.y;
const int ihw_block_idx = blockIdx.x * blockDim.x + threadIdx.x;
const int ihw_offset = ihw_block_idx * pack_w;
if (ihw_offset < ihw) {
const int ic_block = ic / pack_c;
const int remain_ic = ic % pack_c;
const int src_offset_base = n_idx * n_stride_src + ihw_offset;
const int dst_offset_base = n_idx * n_stride_dst + ihw_offset * pack_c;
for (int ic_blk_idx = 0; ic_blk_idx < ic_block; ++ic_blk_idx) {
const int ic_offset = ic_blk_idx * pack_c * ic_stride;
RelayoutKern<false, pack_w, pack_c, same_scale, SrcType, DstType,
DnnSrcType,
DnnDstType>::core_relayout_kern(src, dst,
src_offset_base,
dst_offset_base,
ic_offset, ic_stride,
remain_ic,
post_process,
zero_point);
}
if (remain_ic > 0) {
const int ic_offset = ic_block * pack_c * ic_stride;
RelayoutKern<true, pack_w, pack_c, same_scale, SrcType, DstType,
DnnSrcType,
DnnDstType>::core_relayout_kern(src, dst,
src_offset_base,
dst_offset_base,
ic_offset, ic_stride,
remain_ic,
post_process,
zero_point);
}
}
}
} // namespace
template <int pack_w = 1>
void relayout_format::relayout_format_cuda_exec(
const TensorND& src, const TensorND& dst, const cudaStream_t& stream,
const float src_scale, const float dst_scale,
const uint8_t src_zero_point, const uint8_t dst_zero_point) {
constexpr int pack_oc = 4;
const int n = src.layout[0];
const int c = src.layout[1];
const int h = src.layout[2];
const int w = src.layout[3];
const int hw = h * w;
const int oc_block = DIVUP(c, pack_oc);
const int n_stride_src = c * hw;
const int ic_stride = hw;
const int n_stride_dst = oc_block * pack_oc * h * w;
auto& src_layout = src.layout;
auto& dst_layout = dst.layout;
bool same_scale = src_scale == dst_scale;
#define RUN_KERNEL(same_scale, SRC_TYPE, DST_TYPE, SRC_C_TYPE, DST_C_TYPE) \
if (same_scale) { \
int nr_threads = query_blocksize_for_kernel( \
kern_nchw_nchw4<pack_w, true, SRC_C_TYPE, DST_C_TYPE, \
SRC_TYPE, DST_TYPE>); \
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), n); \
const dim3 thread_dim(nr_threads); \
kern_nchw_nchw4<pack_w, true><<<block_dim, thread_dim, 0, stream>>>( \
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, c, hw, \
n_stride_src, ic_stride, n_stride_dst, \
CudaPostProcess<SRC_TYPE, DST_TYPE, true>( \
src_scale, src_zero_point, dst_scale, dst_zero_point), \
src_zero_point); \
} else { \
int nr_threads = query_blocksize_for_kernel( \
kern_nchw_nchw4<pack_w, false, SRC_C_TYPE, DST_C_TYPE, \
SRC_TYPE, DST_TYPE>); \
const dim3 block_dim(DIVUP(hw, nr_threads* pack_w), n); \
const dim3 thread_dim(nr_threads); \
kern_nchw_nchw4<pack_w, false><<<block_dim, thread_dim, 0, stream>>>( \
(SRC_C_TYPE*)src.raw_ptr, (DST_C_TYPE*)dst.raw_ptr, c, hw, \
n_stride_src, ic_stride, n_stride_dst, \
CudaPostProcess<SRC_TYPE, DST_TYPE, false>( \
src_scale, src_zero_point, dst_scale, dst_zero_point), \
src_zero_point); \
}
if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::Uint8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) {
RUN_KERNEL(same_scale, dtype::Uint8, dtype::QuantizedS8, char, char);
} else if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized8Asymm &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) {
RUN_KERNEL(same_scale, dtype::Quantized8Asymm, dtype::QuantizedS8, char,
char);
} else if (src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) {
RUN_KERNEL(same_scale, dtype::QuantizedS8, dtype::QuantizedS8, char,
char);
} else {
megdnn_assert(0, "not support dtype %s %s", src_layout.dtype.name(),
dst_layout.dtype.name());
}
}
bool relayout_format::relayout_format_cuda_usable(
const TensorLayout& src_layout, const TensorLayout& dst_layout) {
bool is_all_continue =
src_layout.is_contiguous() && dst_layout.is_contiguous();
bool is_all_int8 =
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Uint8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) ||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::Quantized8Asymm &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8) ||
(src_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8 &&
dst_layout.dtype.enumv().ev == DTypeEnum::Ev::QuantizedS8);
return is_all_continue && is_all_int8;
}
template void relayout_format::relayout_format_cuda_exec<1>(
const TensorND& src, const TensorND& dst, const cudaStream_t& stream,
const float src_scale, const float dst_scale,
const uint8_t src_zero_point, const uint8_t dst_zero_point);
template void relayout_format::relayout_format_cuda_exec<4>(
const TensorND& src, const TensorND& dst, const cudaStream_t& stream,
const float src_scale, const float dst_scale,
const uint8_t src_zero_point, const uint8_t dst_zero_point);
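As a concrete check of the launch geometry above (assumed numbers, not taken from the commit):

    // Worked example: NCHW (8, 3, 224, 224), Uint8 -> QuantizedS8, pack_w = 4.
    // hw = 224 * 224 = 50176; if query_blocksize_for_kernel returns 256 threads,
    //   block_dim = (DIVUP(50176, 256 * 4), 8) = (49, 8),  thread_dim = 256,
    // and each thread converts 4 consecutive pixels; since ic = 3 < pack_c = 4,
    // the fourth output channel is filled from the zero point via read_with_pad.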
/**
* \file dnn/src/cuda/relayout_format/relayout_format.cuh
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/basic_types.h"
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace relayout_format {
template <int pack_w = 1>
void relayout_format_cuda_exec(const TensorND& src, const TensorND& dst,
const cudaStream_t& stream,
const float src_scale = 1.f,
const float dst_scale = 1.f,
const uint8_t src_zero_point = 0,
const uint8_t dst_zero_point = 0);
bool relayout_format_cuda_usable(const TensorLayout& src_layout,
const TensorLayout& dst_layout);
} // namespace relayout_format
} // namespace cuda
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file dnn/src/cuda/relayout_format/relayout_format.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/basic_types.h"
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace relayout_format {
struct RelayoutFormatFast {
static bool usable(const TensorLayout& src_layout,
const TensorLayout& dst_layout);
static void exec(const TensorND& src, const TensorND& dst,
cudaStream_t stream);
};
} // namespace relayout_format
} // namespace cuda
} // namespace megdnn
// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -6,11 +6,12 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/naive/relayout_format/opr_impl.h"
#include "src/naive/handle.h"
#include "src/naive/relayout_format/opr_impl.h"
#include "megdnn/tensor_iter.h"
......@@ -44,7 +45,7 @@ void padding_src_to_workspace(dtype* dptr, const dtype* sptr, size_t N,
template <typename dtype>
void padding_to_workspace(dtype* dptr, const dtype* sptr,
const TensorLayout& src_layout, const size_t pad_axis,
const size_t align_size) {
const size_t align_size, const int pad_val = 0) {
megdnn_assert(pad_axis < src_layout.ndim);
const size_t axis_dim = src_layout[pad_axis];
const size_t axis_dim_padded = round_up(axis_dim, align_size);
......@@ -64,14 +65,16 @@ void padding_to_workspace(dtype* dptr, const dtype* sptr,
sptr[src_inner_offset + inner_idx_offset];
} else {
dptr[dst_outer_offset + inner_idx_offset] =
static_cast<dtype>(0);
static_cast<dtype>(pad_val);
}
}
}
}
}
void padding_to_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src,
const size_t pad_axis, const size_t align_size) {
const size_t pad_axis, const size_t align_size,
DType exec_dst_dtype) {
switch (src.layout.dtype.enumv()) {
#define cb(name, ctype) \
case (DTypeEnum::name): { \
......@@ -84,8 +87,27 @@ void padding_to_workspace(_megdnn_tensor_out dst, _megdnn_tensor_in src,
cb(Float32, dt_float32);
cb(QuantizedS8, dt_qint8);
case (DTypeEnum::Quantized8Asymm): {
dt_quint8* sptr = src.compatible_ptr<dt_quint8>();
dt_quint8* dptr = dst.compatible_ptr<dt_quint8>();
padding_to_workspace<dt_quint8>(
dptr, sptr, src.layout, pad_axis, align_size,
src.layout.dtype.param<dtype::Quantized8Asymm>()
.zero_point);
break;
}
case (DTypeEnum::Uint8): {
uint8_t* sptr = src.compatible_ptr<uint8_t>();
uint8_t* dptr = dst.compatible_ptr<uint8_t>();
uint8_t zero_point =
exec_dst_dtype.enumv() == DTypeEnum::QuantizedS8 ? 128 : 0;
padding_to_workspace<uint8_t>(dptr, sptr, src.layout, pad_axis,
align_size, zero_point);
break;
}
default:
megdnn_assert(0);
megdnn_assert(0, "not support dtype %s", src.layout.dtype.name());
#undef cb
}
}
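// Why the Uint8 case above pads with 128 when the exec dst dtype is
// QuantizedS8: the subsequent u8 -> q8 copy (do_copy_diff_u8_q8 below)
// computes quantize((float)u8 - 128.f), so channels padded with 128 map to
// exactly 0 in the QuantizedS8 output instead of a spurious -128.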
......@@ -108,6 +130,57 @@ void padding_filter_to_workspace(dtype* dptr, const dtype* sptr, size_t OC,
}
}
}
void do_copy_diff_qu8_q8(const TensorND& dst, const TensorND& src) {
auto isrc =
tensor_iter_valonly<DTypeTrait<dtype::Quantized8Asymm>::ctype>(src)
.begin();
auto idst = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS8>::ctype>(dst)
.begin();
auto src_dt_parm = src.layout.dtype.param<dtype::Quantized8Asymm>();
auto dst_dt_parm = dst.layout.dtype.param<dtype::QuantizedS8>();
for (size_t i = 0, it = dst.layout.total_nr_elems(); i < it; ++i) {
*idst = dst_dt_parm.quantize(src_dt_parm.dequantize(*isrc));
++idst;
++isrc;
}
}
void do_copy_diff_q8_q8(const TensorND& dst, const TensorND& src) {
auto isrc = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS8>::ctype>(src)
.begin();
auto idst = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS8>::ctype>(dst)
.begin();
auto src_dt_parm = src.layout.dtype.param<dtype::QuantizedS8>();
auto dst_dt_parm = dst.layout.dtype.param<dtype::QuantizedS8>();
for (size_t i = 0, it = dst.layout.total_nr_elems(); i < it; ++i) {
*idst = dst_dt_parm.quantize(src_dt_parm.dequantize(*isrc));
++idst;
++isrc;
}
}
void do_copy_diff_u8_q8(const TensorND& dst, const TensorND& src) {
auto isrc =
tensor_iter_valonly<DTypeTrait<dtype::Uint8>::ctype>(src).begin();
auto idst = tensor_iter_valonly<DTypeTrait<dtype::QuantizedS8>::ctype>(dst)
.begin();
auto dst_dt_parm = dst.layout.dtype.param<dtype::QuantizedS8>();
for (size_t i = 0, it = dst.layout.total_nr_elems(); i < it; ++i) {
*idst = dst_dt_parm.quantize((float)(*isrc) - 128.f);
++idst;
++isrc;
}
}
void check_layout_and_canonize(TensorLayout& src, TensorLayout& dst) {
megdnn_assert(dst.is_non_overlapping_strong());
src = src.collapse_contiguous();
dst = dst.collapse_contiguous();
megdnn_assert(dst.dtype.valid() &&
src.total_nr_elems() == dst.total_nr_elems());
}
} // anonymous namespace
size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
......@@ -189,6 +262,13 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
size_t w = src[3];
return n * c * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW4: {
size_t n = src[0];
size_t c = round_up(src[1], 4_z);
size_t h = src[2];
size_t w = src[3];
return n * c * h * w * src.dtype.size();
}
case Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT: {
megdnn_assert(src.ndim == 4, "src must be oihw, ndim == 4");
if (src[1] % 4 == 0)
......@@ -208,6 +288,8 @@ size_t RelayoutFormatImpl::get_workspace_in_bytes(const TensorLayout& src,
void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
_megdnn_workspace workspace) {
megdnn_assert(src.layout.dtype.category() == DTypeCategory::FLOAT ||
(src.layout.dtype.enumv() == DTypeEnum::Uint8 &&
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) ||
src.layout.dtype.category() == DTypeCategory::QUANTIZED);
check_exec(src.layout, dst.layout, workspace.size);
HandleImpl* m_handle = static_cast<HandleImpl*>(handle());
......@@ -284,7 +366,7 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
size_t val = src.layout[_idx]; \
if (val % _pack_size != 0) { \
padding_to_workspace({workspace.raw_ptr, exec_src}, src, _idx, \
_pack_size); \
_pack_size, exec_dst.dtype); \
exec_src_nd.raw_ptr = workspace.raw_ptr; \
} \
} \
......@@ -301,11 +383,43 @@ void RelayoutFormatImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
cb(2, 8, NCHW_NCHW88_CONV_GROUP_WEIGHT);
} else if (param().mode == Param::Mode::NCHW_NCHW4_IC_SMALL) {
cb(1, 4, NCHW_NCHW4_IC_SMALL);
} else if (param().mode == Param::Mode::NCHW_NCHW4) {
cb(1, 4, NCHW_NCHW4);
} else if (param().mode ==
Param::Mode::NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT) {
cb(1, 4, NCHW_NCHW4_IC_SMALL_CONV_DENSE_WEIGHT);
}
m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle());
if (src.layout.dtype.enumv() == DTypeEnum::Quantized8Asymm &&
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
TensorND src0 = exec_src_nd, dst0 = exec_dst_nd;
check_layout_and_canonize(src0.layout, src0.layout);
auto func = [](const TensorND& dst, const TensorND& src) {
do_copy_diff_qu8_q8(dst, src);
};
MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0));
return;
} else if (src.layout.dtype.enumv() == DTypeEnum::Uint8 &&
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
TensorND src0 = exec_src_nd, dst0 = exec_dst_nd;
check_layout_and_canonize(src0.layout, src0.layout);
auto func = [](const TensorND& dst, const TensorND& src) {
do_copy_diff_u8_q8(dst, src);
};
MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0));
return;
} else if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS8 &&
dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8) {
TensorND src0 = exec_src_nd, dst0 = exec_dst_nd;
check_layout_and_canonize(src0.layout, src0.layout);
auto func = [](const TensorND& dst, const TensorND& src) {
do_copy_diff_q8_q8(dst, src);
};
MEGDNN_DISPATCH_CPU_KERN_OPR(func(dst0, src0));
return;
} else {
m_handle->relayout_opr()->exec(exec_src_nd, exec_dst_nd, handle());
}
#undef cb
}
......
......@@ -6,10 +6,12 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/dtype.h"
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"
......@@ -24,6 +26,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT) {
param.mode = param::RelayoutFormat::Mode::NCHW4_CHWN4;
checker.set_dtype(0, dtype::QuantizedS8{0.1f})
.set_dtype(1, dtype::QuantizedS8{0.1f})
.set_rng(0, &rng)
.set_param(param)
.execs({{22, 23, 24, 25, 4}, {}});
......@@ -31,6 +34,164 @@ TEST_F(CUDA, RELAYOUT_FORMAT) {
checker.execs({{22, 23, 24, 25, 4}, {}});
}
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{0, 50};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
for (size_t n : {1, 3}) {
for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
for (size_t h : {3, 7, 12, 16, 22, 59, 83}) {
for (size_t w : {3, 22, 63, 128, 256}) {
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{2.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
}
}
}
}
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{8, 3, 224, 224}, {}});
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{8, 3, 600, 600}, {}});
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{1, 6, 768, 1280}, {}});
}
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_DEFAULT) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{0, 50};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
for (size_t n : {1, 3}) {
for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
for (size_t h : {3, 7, 12, 16, 59, 83}) {
for (size_t w : {3, 63, 128, 256}) {
checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
}
}
}
}
}
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_U8) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{0, 255};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
for (size_t n : {1, 3}) {
for (size_t c : {1, 2, 3, 4, 8, 9, 11, 16}) {
for (size_t h : {3, 7, 12, 16, 59, 83}) {
for (size_t w : {3, 13, 3 * 4, 63 * 4, 128 * 4, 256 * 4}) {
checker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
checker.set_dtype(0, dtype::Quantized8Asymm{1.f, 128})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
checker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::QuantizedS8{2.5f})
.set_rng(0, &rng)
.set_param(param)
.execs({{n, c, h, w}, {}});
}
}
}
}
}
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW_NCHW4_IC_SMALL) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{0, 50};
param::RelayoutFormat param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
checker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f})
.set_rng(0, &rng)
.set_param(param)
.execs({{8, 3, 768, 1280}, {}});
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_RELAYOUT_FORMAT) {
using Param = RelayoutFormat::Param;
auto run = [&](const TensorShapeArray& shapes, Param param,
Param default_param) {
Benchmarker<RelayoutFormat> benchmarker(handle_cuda());
benchmarker.set_param(param);
benchmarker.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f});
Benchmarker<RelayoutFormat> benchmarker_default(handle_cuda());
benchmarker_default.set_param(default_param);
benchmarker_default.set_dtype(0, dtype::QuantizedS8{1.f})
.set_dtype(1, dtype::QuantizedS8{1.f});
for (auto&& shape : shapes) {
double memaccess = (double(shape.total_nr_elems()) +
double(shape[0]) * ((shape[1] + 3) / 4 * 4) *
shape[2] * shape[3]) *
1e-6;
auto time_ms = benchmarker.execs({shape, {}});
if (shape[1] <= 4) {
auto time_default_ms = benchmarker_default.execs({shape, {}});
printf("execute %s, time %.4f ms, %.4f GB/s, default %.4f "
"GB/s\n",
shape.to_string().c_str(), time_ms, memaccess / time_ms,
memaccess / time_default_ms);
} else {
printf("execute %s, time %.4f ms, %.4f GB/s\n",
shape.to_string().c_str(), time_ms, memaccess / time_ms);
}
}
};
TensorShapeArray shapes = {
{8, 1, 768, 1280}, {8, 3, 768, 1280}, {8, 3, 224, 224},
{8, 4, 768, 1280}, {64, 3, 768, 1280},
};
{
Param param;
param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4;
Param default_param;
default_param.mode = param::RelayoutFormat::Mode::NCHW_NCHW4_IC_SMALL;
run(shapes, param, default_param);
}
}
#endif
TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
Checker<RelayoutFormat> checker(handle_cuda());
UniformIntRNG rng{-50, 50};
......@@ -39,7 +200,7 @@ TEST_F(CUDA, RELAYOUT_FORMAT_NCHW4) {
for (DType dtype :
std::vector<DType>({dtype::QuantizedS8{0.1f}, dtype::Float32{}})) {
checker.set_dtype(0, dtype).set_rng(0, &rng);
checker.set_dtype(0, dtype).set_dtype(1, dtype).set_rng(0, &rng);
checker.set_param(param).execs({{2, 4, 35, 36}, {}});
checker.set_param(param).execs({{2, 3, 35, 36}, {}});
......
......@@ -219,7 +219,10 @@ R"__usage__(
Execute operators with weight preprocess, which can optimize the operator execution time with
algos such as winograd, im2col, etc., but it may consume more memory.
)__usage__"
R"__usage__(
--enable-fuse-preprocess
Fuse astype/pad_channel/dimshuffle and other preprocess oprs after the h2d opr
)__usage__"
;
struct DataParser {
......@@ -1141,6 +1144,11 @@ Args Args::from_argv(int argc, char **argv) {
graph_opt.graph_opt.enable_nchw44_dot();
continue;
}
if (!strcmp(argv[i], "--enable-fuse-preprocess")) {
mgb_log_warn("enable-fuse-preprocess optimization");
graph_opt.graph_opt.enable_fuse_preprocess();
continue;
}
if (!strcmp(argv[i], "--enable-fuse-conv-bias-nonlinearity")) {
mgb_log_warn("enable fuse-conv-bias-nonlinearity optimization");
graph_opt.graph_opt.enable_fuse_conv_bias_nonlinearity();
......
......@@ -101,6 +101,8 @@ struct GraphCommonOptimizeOptions {
//! memory, default disable now, when weight preprocess is enabled, the
//! input shape should no change
bool weight_preprocess = false;
//! fuse preprocess pattern, like astype + pad_channel + dimshuffle
bool fuse_preprocess = false;
enum LayoutTransform : uint32_t {
DEFAULT,
NCHW4, ///< compute using NCHW4 tensor format
......@@ -130,6 +132,7 @@ struct GraphCommonOptimizeOptions {
SET(f16_io_comp);
SET(fuse_conv_bias_nonlinearity);
SET(fuse_conv_bias_with_z);
SET(fuse_preprocess);
SET(weight_winograd_transform);
SET(weight_preprocess);
#undef SET
......
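User code toggles the new flag like any other option above (this mirrors the PreProcessCase0/1 tests below and the --enable-fuse-preprocess flag added to load_and_run):

    auto options = gopt::OptimizeForInferenceOptions{};
    options.enable_fuse_preprocess();
    unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);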
......@@ -724,6 +724,8 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
options.disable_##_option(); \
} \
}
cb(fuse_preprocess, {add_pass(FuseNCHW4Int8Preprocess::make());});
cb(f16_io_comp, { add_pass(ConvertF32ToF16Pass::make(false)); });
cb(f16_io_f32_comp, { add_pass(ConvertF32ToF16Pass::make(true)); });
......@@ -761,6 +763,7 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_optimize_options(
add_pass(EnableTensorCorePass::make_tensorcore_converter());
add_pass<ShuffleShuffleRemovePass>();
add_pass<RemoveRedundantTypeCvtPass>();
add_pass(FuseNCHW4Int8Preprocess::make());
});
cb(chwn4, {
add_pass<FuseConvBiasNonlinPass>();
......
/**
* \file src/gopt/impl/fuse_nchw4_int8_preprocess.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megbrain/gopt/inference.h"
#include "megbrain/gopt/misc.h"
#include "megbrain/graph/grad_impl.h"
#include "megbrain/opr/cond.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/serialization/opr_shallow_copy.h"
#include "megbrain/serialization/serializer.h"
using namespace mgb;
using namespace gopt;
namespace {
#define RETURN_IF_FALSE(ok) \
{ \
if (!ok) \
return ok; \
}
struct SubGraphMatcher {
struct Node {
using CallBack = std::function<bool(OperatorNodeBase* opr)>;
Node(Typeinfo* in_op_type) : op_type(in_op_type){};
Node(Typeinfo* in_op_type, CallBack func)
: op_type(in_op_type), cbk(func){};
Node(Typeinfo* in_op_type, std::vector<Node> in_pre_node)
: op_type(in_op_type), pre_node(in_pre_node){};
Node(Typeinfo* in_op_type, std::vector<Node> in_pre_node, CallBack func)
: op_type(in_op_type), pre_node(in_pre_node), cbk(func){};
Typeinfo* op_type{nullptr};
std::vector<Node> pre_node;
//! cbk used to check param and gather args for creating fusion op
CallBack cbk;
};
bool match(Node& root, OperatorNodeBase* opr) {
if (opr == nullptr) {
return false;
}
//! match nullptr node always
if (root.op_type == nullptr || root.op_type == opr->dyn_typeinfo()) {
bool match_ok = true;
if (root.cbk)
match_ok &= root.cbk(opr);
RETURN_IF_FALSE(match_ok);
auto& inp = opr->input();
for (size_t node_idx = 0; node_idx < root.pre_node.size();
++node_idx) {
bool valid_node_idx = node_idx < inp.size();
RETURN_IF_FALSE(valid_node_idx);
match_ok &= match(root.pre_node[node_idx],
inp[node_idx]->owner_opr());
RETURN_IF_FALSE(match_ok);
}
return match_ok;
} else {
return false;
}
}
};
#undef RETURN_IF_FALSE
struct SubGraphChecker {
using DepType = cg::OperatorNodeProp::DepType;
using ReaderType =
ThinHashMap<OperatorNodeBase*,
SmallVector<std::pair<OperatorNodeBase*, DepType>>>;
SubGraphChecker() {}
bool check(ThinHashSet<OperatorNodeBase*> used_input,
OperatorNodeBase* start_opr, OperatorNodeBase* stop_opr,
ReaderType& readers, bool ignore_immutable = true) {
bool is_all_inp_used = check_all_inp_used(used_input, start_opr,
stop_opr, ignore_immutable);
bool is_all_dep_inside =
check_all_dep_inside_node(start_opr, stop_opr, readers);
return is_all_inp_used && is_all_dep_inside;
}
bool check_all_inp_used(ThinHashSet<OperatorNodeBase*>& used_input,
OperatorNodeBase* start_opr,
OperatorNodeBase* stop_opr,
bool ignore_immutable = true) {
ThinHashSet<OperatorNodeBase*> leaf_set;
get_leaf_node(start_opr, stop_opr, leaf_set);
for (auto in_opr : leaf_set) {
bool skip = in_opr->same_type<opr::ImmutableTensor>() &&
ignore_immutable;
if (used_input.find(in_opr) == used_input.end() && !skip) {
return false;
}
}
return true;
}
bool check_all_dep_inside_node(OperatorNodeBase* start_opr,
OperatorNodeBase* stop_opr,
ReaderType& readers) {
ThinHashSet<OperatorNodeBase*> mid_set;
get_mid_node(start_opr, start_opr, stop_opr, mid_set);
for (auto inner_opr : mid_set) {
if (readers.find(inner_opr) != readers.end()) {
for (auto& out_node : readers[inner_opr]) {
if (mid_set.find(out_node.first) == mid_set.end() &&
out_node.first != start_opr &&
out_node.second ==
cg::OperatorNodeProp::DepType::DEV_VALUE) {
return false;
}
}
}
}
return true;
}
void get_mid_node(OperatorNodeBase* opr, OperatorNodeBase* start_opr,
OperatorNodeBase* stop_opr,
ThinHashSet<OperatorNodeBase*>& mid_set) {
if (opr == nullptr) {
return;
}
if (opr != start_opr) {
mid_set.insert(opr);
}
if (opr == stop_opr) {
return;
}
for (auto& tensor : opr->input()) {
auto pre_opr = tensor->owner_opr();
get_mid_node(pre_opr, start_opr, stop_opr, mid_set);
}
}
void get_leaf_node(OperatorNodeBase* opr, OperatorNodeBase* stop_opr,
ThinHashSet<OperatorNodeBase*>& leaf_set) {
if (opr == nullptr) {
return;
}
if (opr == stop_opr || opr->input().size() == 0) {
leaf_set.insert(opr);
}
if (opr == stop_opr) {
return;
}
for (auto& tensor : opr->input()) {
auto pre_opr = tensor->owner_opr();
get_leaf_node(pre_opr, stop_opr, leaf_set);
}
}
};
static inline bool is_shape_nchw(const TensorShape& shape) {
return shape.ndim == 4;
}
static inline bool is_shape_before_nchw4(const TensorShape& shape) {
return shape.ndim == 5 && shape[2] == 4;
}
static inline bool is_nchw_nchw4_shuffle_vec(
const opr::Dimshuffle::Param param) {
return param.ndim == 5 && param.pattern[0] == 0 && param.pattern[1] == 1 &&
param.pattern[2] == 3 && param.pattern[3] == 4 &&
param.pattern[4] == 2;
}
template <typename T>
static inline bool is_immutable_equal(OperatorNodeBase* opr, T val,
DTypeEnum dtype_enum) {
auto const_opr = opr->try_cast_final<opr::ImmutableTensor>();
if (!const_opr) {
return false;
}
auto& host_value = const_opr->host_value();
bool ok_value = host_value.layout().total_nr_elems() == 1 &&
host_value.dtype().enumv() == dtype_enum &&
host_value.ptr<T>()[0] == val;
return ok_value;
}
template <typename T>
static inline bool is_immutable_all_equal(OperatorNodeBase* opr,
typename DTypeTrait<T>::ctype val) {
auto const_opr = opr->try_cast_final<opr::ImmutableTensor>();
if (!const_opr) {
return false;
}
auto& host_value = const_opr->host_value();
bool ok_value = host_value.dtype().enumv() == DTypeTrait<T>::enumv;
if (!ok_value) {
return false;
}
size_t nr_elem = host_value.layout().total_nr_elems();
for (size_t i = 0; i < nr_elem; ++i) {
if (host_value.ptr<typename DTypeTrait<T>::ctype>()[i] != val) {
ok_value = false;
break;
}
}
return ok_value;
}
} // namespace
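// Schematic of the subgraph fused by replace_shuffle_opr below (this is the
// pattern built in the PreProcessCase0 test of this commit):
//   x : Quantized8Asymm, NCHW
//     -> TypeCvt to QuantizedS8
//     -> Concat(axis = 1) with a broadcast/immutable zero    (pad C up to 4k)
//     -> Reshape to (N, C', 4, H, W)
//     -> Dimshuffle {0, 1, 3, 4, 2}                          (NCHW4 layout)
// The whole chain is replaced by a single RelayoutFormat opr with mode
// NCHW_NCHW4; replace_astype_opr matches the analogous Uint8 chain (TypeCvt to
// Float32, subtract 128, pad, reshape, dimshuffle, TypeCvt to QuantizedS8) and
// rewrites it the same way.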
const char* FuseNCHW4Int8Preprocess::name() const {
return "fuse_pre_process_pass";
}
std::unique_ptr<FuseNCHW4Int8Preprocess> FuseNCHW4Int8Preprocess::make() {
using SGM = SubGraphMatcher;
auto gen_pad_dimshuffle_graph = [&](SGM::Node& in_node,
SGM::Node::CallBack& pad_cbk,
SGM::Node::CallBack& shape_cbk) {
SGM::Node::CallBack check_pad = [&](OperatorNodeBase* opr) {
SGM sub_matcher;
SGM::Node immu_node{opr::ImmutableTensor::typeinfo(), pad_cbk};
if (opr->same_type<opr::ImmutableTensor>()) {
return sub_matcher.match(immu_node, opr);
} else if (opr->same_type<opr::Broadcast>()) {
return sub_matcher.match(immu_node,
opr->input()[0]->owner_opr());
} else {
return false;
}
};
SGM::Node broadcast_or_immutable{nullptr, check_pad};
SGM::Node broadcast_concat{
opr::Concat::typeinfo(),
{in_node, broadcast_or_immutable},
[](OperatorNodeBase* opr) {
auto concat_pad = opr->try_cast_final<opr::Concat>();
return concat_pad->axis() == 1;
}};
SGM::Node nchwx_reshape{opr::Reshape::typeinfo(),
{broadcast_concat, SGM::Node(nullptr)},
[](OperatorNodeBase* opr) {
auto inp0 = opr->input()[0];
return is_shape_nchw(inp0->shape());
}};
SGM::Node shuffle_root{
opr::Dimshuffle::typeinfo(),
{nchwx_reshape},
[](OperatorNodeBase* opr) {
auto& shuffle_opr = opr->cast_final<opr::Dimshuffle>();
auto& input_vec = shuffle_opr.input();
return is_shape_before_nchw4(input_vec[0]->shape()) &&
is_nchw_nchw4_shuffle_vec(shuffle_opr.param());
}};
return shuffle_root;
};
auto replace_shuffle_opr = [&](OperatorNodeBase* opr,
const VarNodeArray& new_inp,
SubGraph::Rewriter& rewriter,
ReaderType& reader) {
SGM matcher;
OperatorNodeBase* src_node = nullptr;
SGM::Node input_data_cp{
nullptr, [&](OperatorNodeBase* opr) {
auto src_dtype = opr->output()[0]->dtype();
if (src_dtype.enumv() == DTypeEnum::Quantized8Asymm) {
src_node = opr;
return true;
} else {
return false;
}
}};
SGM::Node type_cvt{opr::TypeCvt::typeinfo(), {input_data_cp}};
SGM::Node::CallBack const_pad_cbk = [&](OperatorNodeBase* opr) {
bool is_fp32_pad = is_immutable_all_equal<dtype::Float32>(opr, 0);
bool is_i32_pad = is_immutable_all_equal<dtype::Int32>(opr, 0);
bool is_q8_pad = is_immutable_all_equal<dtype::QuantizedS8>(
opr, dt_qint8(0));
return is_fp32_pad || is_i32_pad || is_q8_pad;
};
SGM::Node::CallBack const_reshape_cbk = [](OperatorNodeBase* opr) {
return true;
};
auto&& shuffle_root = gen_pad_dimshuffle_graph(type_cvt, const_pad_cbk,
const_reshape_cbk);
bool match = matcher.match(shuffle_root, opr);
bool check_ok = false;
if (match) {
check_ok =
SubGraphChecker().check({src_node}, opr, src_node, reader);
}
if (match && check_ok) {
opr::RelayoutFormat::Param param;
param.mode = opr::RelayoutFormat::Param::Mode::NCHW_NCHW4;
OperatorNodeConfig config(opr->output()[0]->dtype());
auto out_node = opr::RelayoutFormat::make(
rewriter.get_var(src_node->output()[0]), param.mode,
config);
return out_node.node()->owner_opr();
} else {
return serialization::copy_opr_shallow(*opr, new_inp,
opr->config());
}
};
auto replace_astype_opr = [&](OperatorNodeBase* opr,
const VarNodeArray& new_inp,
SubGraph::Rewriter& rewriter,
ReaderType& reader) {
SGM matcher;
OperatorNodeBase* src_node = nullptr;
OperatorNodeBase* neg_128_immu_node = nullptr;
OperatorNodeBase* pad0_immu_node = nullptr;
OperatorNodeBase* const_reshape_last_dim_node = nullptr;
SGM::Node input_data_cp{nullptr, [&](OperatorNodeBase* opr) {
auto src_dtype = opr->output()[0]->dtype();
if (src_dtype.enumv() == DTypeEnum::Uint8) {
src_node = opr;
return true;
} else {
return false;
}
}};
SGM::Node cvt_fp32{opr::TypeCvt::typeinfo(),
{input_data_cp},
[](OperatorNodeBase* opr) {
auto cvt_op =
opr->try_cast_final<opr::TypeCvt>();
bool is_fp32 = cvt_op->param().enumv() ==
DTypeEnum::Float32;
return is_fp32;
}};
SGM::Node sub_128{
opr::Elemwise::typeinfo(),
{cvt_fp32},
[&](OperatorNodeBase* opr) {
auto elem_op = opr->try_cast_final<opr::Elemwise>();
bool is_add_op = elem_op->param().mode ==
opr::Elemwise::Param::Mode::ADD;
auto neg_128_op = elem_op->input()[1]->owner_opr();
bool is_neg_128 = is_immutable_equal(neg_128_op, -128.f,
DTypeEnum::Float32);
neg_128_immu_node = is_neg_128 ? neg_128_op : nullptr;
return is_add_op && is_neg_128;
}};
SGM::Node::CallBack const_pad_cbk = [&](OperatorNodeBase* opr) {
pad0_immu_node = opr;
bool is_fp32_pad = is_immutable_all_equal<dtype::Float32>(opr, 0);
bool is_i32_pad = is_immutable_all_equal<dtype::Int32>(opr, 0);
return is_fp32_pad || is_i32_pad;
};
SGM::Node::CallBack const_reshape_cbk = [&](OperatorNodeBase* opr) {
const_reshape_last_dim_node = opr;
return true;
};
auto&& shuffle_root = gen_pad_dimshuffle_graph(sub_128, const_pad_cbk,
const_reshape_cbk);
SGM::Node astype_root{opr::TypeCvt::typeinfo(), {shuffle_root}};
bool match = matcher.match(astype_root, opr);
bool check_ok = false;
if (match) {
check_ok = SubGraphChecker().check(
{src_node, neg_128_immu_node, pad0_immu_node,
const_reshape_last_dim_node},
opr, src_node, reader);
}
if (match && check_ok) {
opr::RelayoutFormat::Param param;
param.mode = opr::RelayoutFormat::Param::Mode::NCHW_NCHW4;
OperatorNodeConfig config(opr->output()[0]->dtype());
auto out_node = opr::RelayoutFormat::make(
rewriter.get_var(src_node->output()[0]), param.mode,
config);
return out_node.node()->owner_opr();
} else {
return serialization::copy_opr_shallow(*opr, new_inp,
opr->config());
}
};
auto ret = std::make_unique<FuseNCHW4Int8Preprocess>();
auto&& replace_func = ret->m_opr_replace_func;
MGB_MARK_USED_VAR(replace_astype_opr);
MGB_MARK_USED_VAR(replace_shuffle_opr);
replace_func[opr::Dimshuffle::typeinfo()] = replace_shuffle_opr;
replace_func[opr::TypeCvt::typeinfo()] = replace_astype_opr;
return ret;
}
void FuseNCHW4Int8Preprocess::apply(OptState& state) const {
state.set_var_replace_check_flag(VarReplaceCheckFlag::CHECK_DTYPE |
VarReplaceCheckFlag::CHECK_SHAPE);
auto rewriter = state.graph().make_rewriter();
VarNodeArray new_inp_cache;
ReaderType readers;
state.graph().iter([&readers](OperatorNodeBase* opr) {
for (auto&& i : opr->node_prop().dep_map()) {
readers[i.first->owner_opr()].emplace_back(opr, i.second);
}
});
auto on_opr = [this, &rewriter, &new_inp_cache,
&readers](OperatorNodeBase* opr) {
auto it = m_opr_replace_func.find(opr->dyn_typeinfo());
if (it != m_opr_replace_func.end()) {
auto&& new_inp = new_inp_cache;
new_inp.clear();
new_inp.reserve(opr->input().size());
for (auto i : opr->input()) {
new_inp.push_back(rewriter.get_var(i));
}
auto new_opr = (it->second)(opr, new_inp, rewriter, readers);
if (new_opr->try_cast_final<opr::RelayoutFormat>()) {
auto &&origin_out = opr->output(),
&&cur_out = new_opr->output();
rewriter.replace_var(origin_out[0], cur_out[0], nullptr);
} else {
auto &&origin_out = opr->output(),
&&cur_out = new_opr->output();
mgb_assert(origin_out.size() == cur_out.size(),
"bad opr replace: src=%s{%s} dst=%s{%s}, %zu != %zu",
opr->cname(), opr->dyn_typeinfo()->name,
new_opr->cname(), new_opr->dyn_typeinfo()->name,
origin_out.size(), cur_out.size());
for (size_t i = 0; i < origin_out.size(); i++) {
rewriter.replace_var(origin_out[i], cur_out[i], nullptr);
}
}
} else {
rewriter.auto_replace_outputs(opr);
}
};
state.graph().iter(on_opr);
rewriter.apply_inplace();
}
\ No newline at end of file
......@@ -152,6 +152,26 @@ namespace gopt {
void apply(OptState& opt) const override;
};
/*!
* \brief fuse preprocess, like pad channel, quint8 to qint8
*/
class FuseNCHW4Int8Preprocess : public Pass {
public:
const char* name() const override;
void apply(OptState& opt) const override;
static std::unique_ptr<FuseNCHW4Int8Preprocess> make();
using DepType = cg::OperatorNodeProp::DepType;
using ReaderType =
ThinHashMap<OperatorNodeBase*,
SmallVector<std::pair<OperatorNodeBase*, DepType>>>;
private:
ThinHashMap<Typeinfo*, thin_function<OperatorNodeBase*(
OperatorNodeBase*, const VarNodeArray&,
SubGraph::Rewriter&, ReaderType&)>>
m_opr_replace_func;
};
/*!
* \brief fuse deconv and typecvt to a deconv opr
*/
......
......@@ -719,15 +719,15 @@ TEST(TestGoptInference, Float16IOFloat32ComputeDeConv) {
};
graph->options().graph_opt_level = 0;
auto s0 = mkvar("s0", {5, 5, 3, 3}),
s1 = mkvar("s1", {1, 5, INP_H, INP_W});
auto s0 = mkvar("s0", {5, 5, 3, 3}), s1 = mkvar("s1", {1, 5, INP_H, INP_W});
auto y = opr::ConvolutionBackwardData::make(s0, s1, {}, {});
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_f16_io_f32_comp();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
ASSERT_EQ(find_opr<opr::ConvolutionBackwardData>(y_opt).param().compute_mode,
opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
ASSERT_EQ(
find_opr<opr::ConvolutionBackwardData>(y_opt).param().compute_mode,
opr::ConvBias::Param::ConvBias::ComputeMode::FLOAT32);
ASSERT_EQ(y_opt.dtype(), dtype::Float32());
HostTensorND host_y, host_y_opt;
......@@ -1603,7 +1603,6 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass_FullBias) {
}
}
TEST(TestGoptInference, ParamMerge) {
auto cns = load_multiple_xpus(2);
HostTensorGenerator<> gen;
......@@ -3364,14 +3363,14 @@ TEST(TestGoptInference, ConvertFormatNCHW44MultiInput) {
auto b = mkvar("b", {1, 1, 16, 16}),
elem0 = opr::Elemwise::make({conv1 + b + b},
opr::Elemwise::Param::Mode::RELU);
opr::Elemwise::Param::Mode::RELU);
auto w2 = mkcvar("w2", {8, 8, 3, 3}),
conv2 = opr::Convolution::make(elem0, w2, param_conv);
auto b1 = mkvar("b1", {1}),
y = opr::Elemwise::make({conv2 + b1 + b},
opr::Elemwise::Param::Mode::RELU);
opr::Elemwise::Param::Mode::RELU);
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
......@@ -3631,4 +3630,97 @@ TEST(TestGoptInference, ConvertFormatCD4GroupOneConv) {
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-3);
}
#if MGB_CUDA
TEST(TestGoptInference, PreProcessCase0) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>
gen(dt_quint8(0), dt_quint8(50), 1, 128, 1234);
auto cn = CompNode::load("gpu0");
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
size_t n = 1;
size_t c = 3;
size_t h = 16;
size_t w = 16;
auto host_x1 = gen({n, c, h, w}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
auto x_q8 = opr::TypeCvt::make(x, dtype::QuantizedS8(1.f), cn);
auto zero = DTypeScalar(dtype::QuantizedS8(1.f));
auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
auto pad_channel_tensor =
opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
auto paded_x = opr::Concat::make({x_q8, pad_channel_tensor}, 1, cn)
.reshape({n, 1, 4, h, w});
auto result = opr::Dimshuffle::make(paded_x, {0, 1, 3, 4, 2}, 5, cn);
auto y = result;
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_fuse_preprocess();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(
output_file("TestGoptInference.PreProcessCase0.json"));
HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
TEST(TestGoptInference, PreProcessCase1) {
REQUIRE_GPU(1);
HostTensorGenerator<dtype::Uint8, RandomDistribution::UNIFORM> gen(0, 255);
auto cn = CompNode::load("gpu0");
auto graph = ComputingGraph::make();
graph->options().graph_opt_level = 0;
size_t n = 1;
size_t c = 3;
size_t h = 16;
size_t w = 16;
auto host_x1 = gen({n, c, h, w}, cn);
auto x = opr::Host2DeviceCopy::make(*graph, host_x1);
auto x_u8 = opr::TypeCvt::make(x, dtype::Float32(), cn);
auto x_s8 = x_u8 - 128;
auto zero = DTypeScalar(dtype::Float32());
auto zero_tensor = opr::ImmutableTensor::make(*graph, zero, cn);
auto pad_channel_tensor =
opr::Broadcast::make(zero_tensor, {n, 1, h, w}, cn);
auto paded_x = opr::Concat::make({x_s8, pad_channel_tensor}, 1, cn)
.reshape({n, 1, 4, h, w});
auto nchw4_out = opr::Dimshuffle::make(paded_x, {0, 1, 3, 4, 2}, 5, cn);
auto result = opr::TypeCvt::make(nchw4_out, dtype::QuantizedS8(1.f));
auto y = result;
SymbolVar y_opt;
auto options = gopt::OptimizeForInferenceOptions{};
options.enable_fuse_preprocess();
unpack_vector(gopt::optimize_for_inference({y}, options), y_opt);
graph->compile({{y_opt, {}}})
->to_json()
->writeto_fpath(
output_file("TestGoptInference.PreProcessCase1.json"));
HostTensorND host_y_opt, host_y;
auto func = graph->compile({make_callback_copy(y, host_y),
make_callback_copy(y_opt, host_y_opt)});
func->execute();
MGB_ASSERT_TENSOR_NEAR(host_y, host_y_opt, 1e-5);
ASSERT_TRUE(y_opt.node()->owner_opr()->same_type<opr::RelayoutFormat>());
}
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
......@@ -198,7 +198,7 @@ Elemwise::Elemwise(
param.mode == Param::Mode::MAX ||
param.mode == Param::Mode::MIN,
"Only ADD, SUB, NEGATE, RELU, MAX and MIN is guaranteed "
"to be supported on Elemwise for quantized DType");
"to be supported on Elemwise for quantized DType, no support %d", (int)param.mode);
}
}
......
......@@ -1578,6 +1578,23 @@ MGB_IMPL_OPR_GRAD(ParamPackSplit) {
// f}}}
/* f{{{ ======================= RelayoutFormat ======================= */
namespace mgb {
namespace opr {
namespace intl {
template <>
struct MegDNNOprInitPostCtor<RelayoutFormat> {
static void apply(cg::OperatorNodeBase& opr) {
if (opr.config().output_dtype().valid()) {
opr.output(0)->dtype(opr.config().output_dtype());
} else {
opr.output(0)->dtype(opr.input(0)->dtype());
}
}
};
} // namespace intl
} // namespace opr
} // namespace mgb
MGB_DYN_TYPE_OBJ_FINAL_IMPL(RelayoutFormat);
MEGDNN_OPR_INIT1(RelayoutFormat, "relayout_format")
......
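The MegDNNOprInitPostCtor specialization above is what lets a caller pin the RelayoutFormat output dtype through OperatorNodeConfig; the fusion pass relies on this when rewriting the matched subgraphs:

    opr::RelayoutFormat::Param param;
    param.mode = opr::RelayoutFormat::Param::Mode::NCHW_NCHW4;
    OperatorNodeConfig config(opr->output()[0]->dtype());
    auto out_node = opr::RelayoutFormat::make(
            rewriter.get_var(src_node->output()[0]), param.mode, config);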
......@@ -190,6 +190,24 @@ namespace mgb {
}
return ret;
}
std::shared_ptr<HostTensorND>
HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>::
operator()(const TensorShape& shape, CompNode cn) {
if (!cn.valid())
cn = CompNode::load("xpu0");
auto dtype = dtype::Quantized8Asymm(m_scale, m_zero_point);
auto param = dtype.param();
std::shared_ptr<HostTensorND> ret =
std::make_shared<HostTensorND>(cn, shape, dtype);
auto ptr = ret->ptr<dt_quint8>();
double scale = (param.dequantize(m_hi) - param.dequantize(m_lo)) /
(m_rng.max() + 1.0);
for (size_t i = 0, it = shape.total_nr_elems(); i < it; ++i) {
ptr[i] = param.quantize(m_rng() * scale + param.dequantize(m_lo));
}
return ret;
}
}
::testing::AssertionResult mgb::__assert_float_equal(
......
......@@ -264,6 +264,10 @@ struct UniformRNGDefaultRange<dtype::QuantizedS8> {
static const dt_qint8 LO, HI;
};
template<>
struct UniformRNGDefaultRange<dtype::Quantized8Asymm> {
static const dt_quint8 LO, HI;
};
//! gaussian
template<class dtype>
class HostTensorGenerator<dtype, RandomDistribution::GAUSSIAN> final:
......@@ -404,6 +408,33 @@ class HostTensorGenerator<dtype::QuantizedS8, RandomDistribution::UNIFORM> final
ctype m_lo, m_hi;
};
template <>
class HostTensorGenerator<dtype::Quantized8Asymm, RandomDistribution::UNIFORM>
final : public HostTensorGeneratorBase {
public:
using ctype = typename DTypeTrait<dtype::Quantized8Asymm>::ctype;
HostTensorGenerator(
ctype lo = UniformRNGDefaultRange<dtype::Quantized8Asymm>::LO,
ctype hi = UniformRNGDefaultRange<dtype::Quantized8Asymm>::HI,
float scale = 1.f, uint8_t zero_point = 0,
uint64_t seed = next_rand_seed())
: HostTensorGeneratorBase{seed},
m_scale{scale},
m_zero_point(zero_point),
m_lo{lo},
m_hi{hi} {}
std::shared_ptr<HostTensorND> operator()(const TensorShape& shape,
CompNode cn = {}) override;
using HostTensorGeneratorBase::operator();
private:
float m_scale;
uint8_t m_zero_point;
ctype m_lo, m_hi;
};
/*!
* \brief get output file name in test output dir
* \param check_writable whether to ensure the file is writable
......