Commit ba66e1d0 authored by Megvii Engine Team

feat(dnn): add nchw_fp32 nchw44_qint8 cuda dct

GitOrigin-RevId: 581e31fc209008418f9821f32f7c71db76f84ddf
Parent b3278229
@@ -182,6 +182,48 @@ class WarpPerspectiveBackwardMat: public WarpPerspectiveBase {
size_t workspace_in_bytes);
};
class DctChannelSelectForward : public OperatorBase {
DEF_OPR_PARAM(DctChannelSelect);
DEF_OPR_IMPL(DctChannelSelectForward, OperatorBase, 3, 1);
public:
/**
     * \param[in] src input tensor, must be a uint8 NCHW tensor
     * \param[in] mask_offset input, an int32 tensor that is either empty or
     *            has (ic + 1) elements
     * \param[in] mask_val input, a one-dimensional int32 tensor
     * \param[out] dst output tensor, fp32 NCHW by default (qint8 NCHW4 when
     *             Format::NCHW4 is used)
     * \param[out] workspace temporary workspace to perform forward
*/
virtual void exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace workspace) = 0;
void deduce_layout(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst);
virtual size_t get_workspace_in_bytes(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst) = 0;
protected:
void check_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst);
void deduce_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst);
std::string param_msg() const;
};
} // namespace megdnn
#include "megdnn/internal/opr_header_epilogue.h"
......
@@ -411,6 +411,9 @@ pdef('ElemwiseMultiType').add_enum(
pdef('PowC', 'power with constant exponent').add_fields('float32', 'exp', 0)
(pdef('DctChannelSelect', '2d discrete cosine transform').add_enum_alias('Format', 'ConvolutionV0').
add_enum('FastImpl', 'NONE', 'FIX_32_MASK').add_fields('int32', 'dct_block_size', 8))
(pdef('MatrixMul', version=0, is_legacy=True).
add_fields('bool', 'transposeA', 'false', 'transposeB', 'false').
add_enum('DataType',
......
/**
* \file dnn/src/common/dct.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs.h"
#include "src/common/utils.h"
namespace megdnn {
void DctChannelSelectForward::deduce_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst) {
const size_t dct_block = param().dct_block_size;
const size_t in = src.shape[0];
const size_t ic = src.shape[1];
const size_t ih = src.shape[2];
const size_t iw = src.shape[3];
check_layout_fwd(src, mask_offset, mask_val, dst);
const size_t oh = ih / dct_block;
const size_t ow = iw / dct_block;
    //! mask_offset is either empty or has (ic + 1) elements
size_t oc = mask_offset.ndim > 0 && mask_offset[0] >= 2
? mask_val.shape[0]
: ic * dct_block * dct_block;
if (param().fastImpl == Param::FastImpl::FIX_32_MASK) {
megdnn_assert(oc == 32,
"Param::FastImpl::FIX_32_MASK oc must be 32, but %zu",
oc);
}
if (param().format == Param::Format::NCHW) {
dst = TensorLayout(TensorShape({in, oc, oh, ow}), dst.dtype);
} else {
megdnn_assert(param().format == Param::Format::NCHW4,
"dct format must be nchw or nchw4");
megdnn_assert(oc % 4 == 0, "oc mod 4 == 0 in nchw4");
dst = TensorLayout(TensorShape({in, oc / 4, oh, ow, 4}), dst.dtype);
}
}
void DctChannelSelectForward::deduce_layout(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
TensorLayout& dst) {
deduce_layout_fwd(src, mask_offset, mask_val, dst);
}
void DctChannelSelectForward::check_layout_fwd(const TensorLayout& src,
const TensorLayout& mask_offset,
const TensorLayout& mask_val,
const TensorLayout& dst) {
const size_t dct_block = param().dct_block_size;
const size_t ih = src.shape[2];
const size_t iw = src.shape[3];
    megdnn_assert(mask_offset.ndim == 0 || (mask_offset.ndim == 1 &&
                                            (mask_offset.shape[0] == 0 ||
                                             mask_offset.shape[0] >= 2) &&
                                            mask_val.ndim == 1),
                  "mask only supports one valid dim");
    megdnn_assert(mask_val.ndim <= 1, "mask_val only supports one dim");
megdnn_assert(src.dtype.enumv() == DTypeEnum::Uint8,
"src.dtype == dtype::Uint8");
megdnn_assert(dst.dtype.enumv() == DTypeEnum::Float32 ||
dst.dtype.enumv() == DTypeEnum::QuantizedS8,
"dst.dtype == dtype::Float32 || dst.dtype.enumv() == "
"DTypeEnum::QuantizedS8");
    megdnn_assert(ih % dct_block == 0, "ih mod dct_block == 0");
    megdnn_assert(iw % dct_block == 0, "iw mod dct_block == 0");
}
} // namespace megdnn
// vim: syntax=cpp.doxygen
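The mask pair behaves like a CSR index: mask_offset is empty or has ic + 1 entries, the half-open range mask_offset[c] .. mask_offset[c + 1] selects the entries of mask_val kept for input channel c, and the deduced oc is simply mask_val.shape[0]. A hand-worked example, with values chosen purely for illustration:

// ic = 2; unmasked, each channel would emit dct_block^2 = 64 output channels
const int mask_offset[] = {0, 3, 4};  // ic + 1 entries
const int mask_val[] = {0, 1, 8,      // channel 0 keeps coefficients 0, 1, 8
                        0};           // channel 1 keeps only the DC term
// deduce_layout then yields dst with oc = mask_val.shape[0] = 4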
@@ -201,6 +201,7 @@ private:
cb(RemapBackwardMat) \
cb(AdaptivePoolingForward) \
cb(AdaptivePoolingBackward) \
cb(DctChannelSelectForward)
/*!
* \brief specialize HandleImpl::create_operator for a single opr type;
......
/**
* \file dnn/src/cuda/dct/dct_channel_select.cu
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megcore_cdefs.h"
#include "src/cuda/dct/dct_channel_select.cuh"
#include "src/cuda/error_info.cuh"
namespace megdnn {
namespace cuda {
template <typename T>
struct CudaPostProcess;
template <>
struct CudaPostProcess<float> {
    CudaPostProcess(float) {}
static inline __device__ float func(float val) { return val; }
};
template <>
struct CudaPostProcess<int8_t> {
CudaDTypeParamImpl<dt_qint8> m_type_cvt;
    CudaPostProcess(float scale) { m_type_cvt.inv_scale = 1.f / scale; }
inline __device__ int8_t func(float val) {
return m_type_cvt.quantize(val).as_int8();
}
};
template <uint32_t format>
struct ChannelBlockHelper;
template <>
struct ChannelBlockHelper<dct::DctLayoutFormat::NCHW4> {
static constexpr int channel_block = 4;
};
template <>
struct ChannelBlockHelper<dct::DctLayoutFormat::NCHW> {
static constexpr int channel_block = 1;
};
namespace dct {
namespace {
inline __device__ void load_row(float (&row_cache)[8], const uint8_t* src) {
int2 row = *((int2*)src);
row_cache[0] = (float)(((uchar4*)&(row.x))->x);
row_cache[1] = (float)(((uchar4*)&(row.x))->y);
row_cache[2] = (float)(((uchar4*)&(row.x))->z);
row_cache[3] = (float)(((uchar4*)&(row.x))->w);
row_cache[4] = (float)(((uchar4*)&(row.y))->x);
row_cache[5] = (float)(((uchar4*)&(row.y))->y);
row_cache[6] = (float)(((uchar4*)&(row.y))->z);
row_cache[7] = (float)(((uchar4*)&(row.y))->w);
}
inline __device__ void fast_dct_1d_internal(float& src0, float& src1,
float& src2, float& src3,
float& src4, float& src5,
float& src6, float& src7) {
constexpr float rsqrt_8 = 0.3535533905932737f; //!< rsqrt_8 = sqrt(1 / 8)
constexpr float a = 1.387039845322148f; //!< a = sqrt2 * cos(pi * 1 / 16)
constexpr float b = 1.306562964876377f; //!< b = sqrt2 * cos(pi * 2 / 16)
constexpr float c = 1.175875602419359f; //!< c = sqrt2 * cos(pi * 3 / 16)
constexpr float d = 0.785694958387102f; //!< d = sqrt2 * cos(pi * 5 / 16)
constexpr float e = 0.541196100146197f; //!< e = sqrt2 * cos(pi * 6 / 16)
constexpr float f = 0.275899379282943f; //!< f = sqrt2 * cos(pi * 7 / 16)
const float add_0_7 = src0 + src7;
const float add_1_6 = src1 + src6;
const float add_2_5 = src2 + src5;
const float add_3_4 = src3 + src4;
const float sub_0_7 = src0 - src7;
const float sub_6_1 = src6 - src1;
const float sub_2_5 = src2 - src5;
const float sub_4_3 = src4 - src3;
const float add_0_7_3_4 = add_0_7 + add_3_4;
const float add_1_6_2_5 = add_1_6 + add_2_5;
const float add_0_7_sub_3_4 = add_0_7 - add_3_4;
const float add_1_6_sub_2_5 = add_1_6 - add_2_5;
src0 = rsqrt_8 * (add_0_7_3_4 + add_1_6_2_5);
src2 = rsqrt_8 * (b * add_0_7_sub_3_4 + e * add_1_6_sub_2_5);
src4 = rsqrt_8 * (add_0_7_3_4 - add_1_6_2_5);
src6 = rsqrt_8 * (e * add_0_7_sub_3_4 - b * add_1_6_sub_2_5);
src1 = rsqrt_8 * (a * sub_0_7 - c * sub_6_1 + d * sub_2_5 - f * sub_4_3);
src3 = rsqrt_8 * (c * sub_0_7 + f * sub_6_1 - a * sub_2_5 + d * sub_4_3);
src5 = rsqrt_8 * (d * sub_0_7 + a * sub_6_1 + f * sub_2_5 - c * sub_4_3);
src7 = rsqrt_8 * (f * sub_0_7 + d * sub_6_1 + c * sub_2_5 + a * sub_4_3);
}
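//! reference: this routine evaluates the orthonormal 8-point DCT-II,
//!   X_k = alpha_k * sum_{n=0}^{7} x_n * cos((2n + 1) * k * pi / 16),
//!   with alpha_0 = 1 / sqrt(8) and alpha_k = 1 / 2 for k > 0;
//! the constants a..f above are the cosines pre-scaled by sqrt(2), so the
//! rsqrt_8 = 1 / sqrt(8) factor reduces each weight to cos(theta) / 2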
inline __device__ void fast_dct_1d(float (&src)[8]) {
    fast_dct_1d_internal(src[0], src[1], src[2], src[3], src[4], src[5], src[6],
src[7]);
}
inline __device__ void fast_dct_1d_col(float (&src)[8][8], const int col) {
    fast_dct_1d_internal(src[0][col], src[1][col], src[2][col], src[3][col],
src[4][col], src[5][col], src[6][col], src[7][col]);
}
enum class MaskType {
NO_MASK = 0,
USER_DEFINE_MASK = 1,
FIX_32_MASK = 2,
MASK_END
};
template <const int dct_block, const int block_oh, const int block_ow,
uint32_t format, MaskType mask_type, typename DstDtype, typename T2>
struct StoreMask;
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW,
MaskType::USER_DEFINE_MASK, float, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], float* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
__shared__ float shared[dct_block][dct_block][block_oh][block_ow];
#pragma unroll
for (int i = 0; i < dct_block; ++i)
#pragma unroll
for (int j = 0; j < dct_block; ++j) {
shared[i][j][threadIdx.y][threadIdx.x] = thread_cache[i][j];
}
const int store_channel_offset = mask_offset[channel_idx];
const int nr_store_channel =
mask_offset[channel_idx + 1] - store_channel_offset;
if (nr_store_channel < 0) {
set_async_error_info(error_info, error_tracker,
"nchw sub mask len must > 0");
}
for (int store_channel_idx = 0; store_channel_idx < nr_store_channel;
++store_channel_idx) {
const int index =
mask_val[store_channel_offset + store_channel_idx];
dst_tid[store_channel_idx * oc_stride] =
shared[index / dct_block][index % dct_block][threadIdx.y]
[threadIdx.x];
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW4,
MaskType::USER_DEFINE_MASK, int8_t, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], int8_t* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
//! nchw4 channel_block is 4
constexpr int channel_block =
ChannelBlockHelper<DctLayoutFormat::NCHW4>::channel_block;
__shared__ float shared[dct_block][dct_block][block_oh][block_ow];
#pragma unroll
for (int i = 0; i < dct_block; ++i)
#pragma unroll
for (int j = 0; j < dct_block; ++j) {
shared[i][j][threadIdx.y][threadIdx.x] = thread_cache[i][j];
}
const int store_channel_offset = mask_offset[channel_idx];
const int nr_store_channel =
mask_offset[channel_idx + 1] - store_channel_offset;
if (nr_store_channel % 4 != 0 || nr_store_channel < 0) {
set_async_error_info(error_info, error_tracker,
"nchw4 sub_mask_len mod 4 should be 0 and "
"sub_mask_len must > 0");
}
for (int store_channel_idx = 0; store_channel_idx < nr_store_channel;
store_channel_idx += channel_block) {
const int index0 =
mask_val[store_channel_offset + store_channel_idx];
const int index1 =
mask_val[store_channel_offset + store_channel_idx + 1];
const int index2 =
mask_val[store_channel_offset + store_channel_idx + 2];
const int index3 =
mask_val[store_channel_offset + store_channel_idx + 3];
const int store_c4_idx = store_channel_idx / channel_block;
*(char4*)(&dst_tid[store_c4_idx * channel_block * oc_stride]) = {
quant_param.func(
shared[index0 / dct_block][index0 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index1 / dct_block][index1 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index2 / dct_block][index2 % dct_block]
[threadIdx.y][threadIdx.x]),
quant_param.func(
shared[index3 / dct_block][index3 % dct_block]
[threadIdx.y][threadIdx.x])};
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
uint32_t format, typename DstDtype, typename T2>
struct StoreMask<dct_block, block_oh, block_ow, format, MaskType::NO_MASK,
DstDtype, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block],
DstDtype* dst_tid, const int oc_stride, int channel_idx,
const int* mask_offset, const int* mask_val,
CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
constexpr int channel_block = ChannelBlockHelper<format>::channel_block;
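        //! coefficient k = i * dct_block + j is written to channel group
        //! k / channel_block (stride channel_block * oc_stride) at inner
        //! slot k % channel_block; for NCHW (channel_block == 1) this is
        //! simply k * oc_stride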
#pragma unroll
for (int i = 0; i < dct_block; i++) {
#pragma unroll
for (int j = 0; j < dct_block; j++) {
dst_tid[(i * dct_block + j) / channel_block * channel_block *
oc_stride +
(i * dct_block + j) % channel_block] =
quant_param.func(thread_cache[i][j]);
}
}
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW,
MaskType::FIX_32_MASK, float, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], float* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
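        //! the (store_index, index) pairs below follow the JPEG-style zig-zag
        //! scan of the 8x8 block: channel 0 keeps 16 coefficients, channels 1
        //! and 2 keep 8 each, giving the fixed total of 32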
#define STORE(store_index, index) \
dst_tid[store_index * oc_stride] = \
thread_cache[index / dct_block][index % dct_block]
STORE(0, 0);
STORE(1, 1);
STORE(2, 8);
STORE(3, 16);
STORE(4, 9);
STORE(5, 2);
STORE(6, 3);
STORE(7, 10);
if (channel_idx == 0) {
STORE(8, 17);
STORE(9, 24);
STORE(10, 32);
STORE(11, 25);
STORE(12, 18);
STORE(13, 11);
STORE(14, 4);
STORE(15, 5);
}
#undef STORE
}
};
template <const int dct_block, const int block_oh, const int block_ow,
typename T2>
struct StoreMask<dct_block, block_oh, block_ow, DctLayoutFormat::NCHW4,
MaskType::FIX_32_MASK, int8_t, T2> {
static inline __device__ void func(
const float (&thread_cache)[dct_block][dct_block], int8_t* dst_tid,
const int oc_stride, int channel_idx, const int* mask_offset,
const int* mask_val, CudaPostProcess<T2>& quant_param,
megcore::AsyncErrorInfo* error_info, void* error_tracker) {
#define STORE(store_index, index0, index1, index2, index3) \
*(char4*)(&dst_tid[store_index * oc_stride]) = { \
quant_param.func( \
thread_cache[index0 / dct_block][index0 % dct_block]), \
quant_param.func( \
thread_cache[index1 / dct_block][index1 % dct_block]), \
quant_param.func( \
thread_cache[index2 / dct_block][index2 % dct_block]), \
quant_param.func( \
thread_cache[index3 / dct_block][index3 % dct_block])}
STORE(0, 0, 1, 8, 16);
STORE(4, 9, 2, 3, 10);
if (channel_idx == 0) {
STORE(8, 17, 24, 32, 25);
STORE(12, 18, 11, 4, 5);
}
#undef STORE
}
};
template <const int dct_block, MaskType mask_type, const int ker_block_h,
const int ker_block_w, uint32_t format, typename DstDtype,
typename T2>
__global__ void kern_dct(const uint8_t* src, DstDtype* dst, const int n,
const int c, const int h, const int w, const int oh,
const int ow, const int oc_stride, const int oc,
const int* mask_offset, const int* mask_val,
CudaPostProcess<T2> quant_param,
megcore::AsyncErrorInfo* error_info,
void* error_tracker) {
constexpr int block_oh = ker_block_h / dct_block;
constexpr int block_ow = ker_block_w / dct_block;
const int channel_stride = h * w;
const int oc_idx = blockIdx.z % c;
const int oh_idx = blockIdx.y * block_oh + threadIdx.y;
const int ow_idx = blockIdx.x * block_ow + threadIdx.x;
float thread_cache[dct_block][dct_block];
const uint8_t* src_tid =
src + blockIdx.z * channel_stride +
(blockIdx.y * ker_block_h + threadIdx.y * dct_block) * w +
(blockIdx.x * ker_block_w + threadIdx.x * dct_block);
const int inner_channel_offset =
(oh_idx * ow + ow_idx) * ChannelBlockHelper<format>::channel_block;
DstDtype* dst_tid =
dst + blockIdx.z * channel_stride + inner_channel_offset;
if (mask_type != MaskType::NO_MASK) {
const int batch_idx = blockIdx.z / c;
const int batch_stride = oc_stride * oc;
int out_channel_offset = 0;
if (mask_type == MaskType::FIX_32_MASK) {
//! trick out_channel_offset = {0, 16, 24}[oc_idx]; oc_idx = 0, 1, 2
out_channel_offset = 16 * oc_idx - 8 * (oc_idx >> 1);
} else {
out_channel_offset = mask_offset[oc_idx];
}
dst_tid = dst + batch_idx * batch_stride +
out_channel_offset * oc_stride + inner_channel_offset;
}
if (oh_idx < oh && ow_idx < ow) {
load_row(thread_cache[0], src_tid + 0 * w);
load_row(thread_cache[1], src_tid + 1 * w);
load_row(thread_cache[2], src_tid + 2 * w);
load_row(thread_cache[3], src_tid + 3 * w);
load_row(thread_cache[4], src_tid + 4 * w);
load_row(thread_cache[5], src_tid + 5 * w);
load_row(thread_cache[6], src_tid + 6 * w);
load_row(thread_cache[7], src_tid + 7 * w);
//! TMP = A @ C.T
fast_dct_1d(thread_cache[0]);
fast_dct_1d(thread_cache[1]);
fast_dct_1d(thread_cache[2]);
fast_dct_1d(thread_cache[3]);
fast_dct_1d(thread_cache[4]);
fast_dct_1d(thread_cache[5]);
fast_dct_1d(thread_cache[6]);
fast_dct_1d(thread_cache[7]);
//! TMP = C @ TMP
fast_dct_1d_col(thread_cache, 0);
fast_dct_1d_col(thread_cache, 1);
fast_dct_1d_col(thread_cache, 2);
fast_dct_1d_col(thread_cache, 3);
fast_dct_1d_col(thread_cache, 4);
fast_dct_1d_col(thread_cache, 5);
fast_dct_1d_col(thread_cache, 6);
fast_dct_1d_col(thread_cache, 7);
StoreMask<dct_block, block_oh, block_ow, format, mask_type, DstDtype,
T2>::func(thread_cache, dst_tid, oc_stride, oc_idx,
mask_offset, mask_val, quant_param, error_info,
error_tracker);
}
}
} // namespace
template <int dct_block, uint32_t format, typename DstDtype>
void call_kern_dct(const uint8_t* d_src, DstDtype* d_dst, const int n,
const int c, const int h, const int w, const int oc,
bool fix_32_mask, const int* mask_offset,
const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker,
float scale) {
constexpr int ker_block_h = 32;
constexpr int ker_block_w = 256;
const int oh = h / dct_block;
const int ow = w / dct_block;
const int oc_stride = oh * ow;
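    //! each thread transforms one 8x8 block: thread_dim is
    //! (256 / 8, 32 / 8) = (32, 4), i.e. 128 threads covering a 256x32 pixel
    //! tile, and block_dim (the launch grid) tiles the image with
    //! gridDim.z = n * c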
const dim3 block_dim(DIVUP(w, ker_block_w), DIVUP(h, ker_block_h), n * c);
const dim3 thread_dim(DIVUP(ker_block_w, dct_block),
DIVUP(ker_block_h, dct_block));
auto cuda_dtype_param = CudaPostProcess<DstDtype>(scale);
if (fix_32_mask) {
kern_dct<dct_block, MaskType::FIX_32_MASK, ker_block_h, ker_block_w,
format><<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc, mask_offset,
mask_val, cuda_dtype_param, error_info, error_tracker);
} else if (mask_offset && mask_val) {
kern_dct<dct_block, MaskType::USER_DEFINE_MASK, ker_block_h,
ker_block_w, format><<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc, mask_offset,
mask_val, cuda_dtype_param, error_info, error_tracker);
} else {
kern_dct<dct_block, MaskType::NO_MASK, ker_block_h, ker_block_w, format>
<<<block_dim, thread_dim, 0, stream>>>(
d_src, d_dst, n, c, h, w, oh, ow, oc_stride, oc,
mask_offset, mask_val, cuda_dtype_param, error_info,
error_tracker);
}
}
template void call_kern_dct<8, DctLayoutFormat::NCHW, float>(
const uint8_t* d_src, float* d_dst, const int n, const int c,
const int h, const int w, const int oc, bool fix_32_mask,
const int* mask_offset, const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker, float scale);
template void call_kern_dct<8, DctLayoutFormat::NCHW4, int8_t>(
const uint8_t* d_src, int8_t* d_dst, const int n, const int c,
const int h, const int w, const int oc, bool fix_32_mask,
const int* mask_offset, const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker, float scale);
} // namespace dct
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
* \file dnn/src/cuda/dct/dct_channel_select.cuh
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
*/
#pragma once
#include <stdint.h>
#include <cstdio>
#include "src/common/opr_param_defs_enumv.cuh"
#include "src/cuda/utils.cuh"
namespace megdnn {
namespace cuda {
namespace dct {
using DctLayoutFormat = megdnn::param_enumv::DctChannelSelect::Format;
template <int dct_block, uint32_t format, typename DstDtype>
void call_kern_dct(const uint8_t* d_src, DstDtype* d_dst, const int n,
const int c, const int h, const int w, const int oc,
bool fix_32_mask, const int* mask_offset,
const int* mask_val, cudaStream_t stream,
megcore::AsyncErrorInfo* error_info, void* error_tracker,
float scale = 1.f);
} // namespace dct
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
 * \file dnn/src/cuda/dct/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/common/utils.h"
#include "src/cuda/dct/dct_channel_select.cuh"
#include "src/cuda/dct/opr_impl.h"
#include "src/cuda/handle.h"
#include "src/cuda/utils.h"
namespace megdnn {
namespace cuda {
void DctChannelSelectForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace /*workspace*/) {
auto stream = cuda_stream(this->handle());
const int in = src.layout.shape[0];
const int ic = src.layout.shape[1];
const int ih = src.layout.shape[2];
const int iw = src.layout.shape[3];
int oc = dst.layout.shape[1];
const bool with_fix_32_mask =
param().fastImpl == Param::FastImpl::FIX_32_MASK;
if (param().format == Param::Format::NCHW4) {
megdnn_assert(dst.layout.ndim == 5 && dst.layout.shape[4] == 4,
"dst must be nchw4");
oc = oc * 4;
}
    megdnn_assert(!with_fix_32_mask || oc == 32,
                  "FIX_32_MASK requires oc == 32");
megdnn_assert(param().dct_block_size == 8, "only support dct block = 8");
auto error_info =
concrete_handle(this->handle())->megcore_context().error_info;
constexpr int dct_block = 8;
const int* mask_offset_ptr = nullptr;
const int* mask_val_ptr = nullptr;
if (mask_offset.layout.ndim == 1 && mask_offset.layout.shape[0] >= 2) {
mask_offset_ptr = mask_offset.ptr<int32_t>();
mask_val_ptr = mask_val.ptr<int32_t>();
}
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
megdnn_assert(param().format == Param::Format::NCHW,
"fp32 only support nchw");
dct::call_kern_dct<dct_block, dct::DctLayoutFormat::NCHW>(
src.ptr<uint8_t>(), dst.ptr<float>(), in, ic, ih, iw, oc,
with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream,
error_info, m_error_tracker);
} else {
megdnn_assert(dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8,
"only support fp32 and qs8");
megdnn_assert(param().format == Param::Format::NCHW4,
"qint8 only support nchw4");
dct::call_kern_dct<dct_block, dct::DctLayoutFormat::NCHW4>(
src.ptr<uint8_t>(), (int8_t*)dst.raw_ptr, in, ic, ih, iw, oc,
with_fix_32_mask, mask_offset_ptr, mask_val_ptr, stream,
error_info, m_error_tracker,
dst.layout.dtype.param<::megdnn::dtype::QuantizedS8>().scale);
}
}
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/cuda/dct/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace cuda {
class DctChannelSelectForwardImpl : public DctChannelSelectForward {
public:
using DctChannelSelectForward::DctChannelSelectForward;
void* m_error_tracker = nullptr;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
const TensorLayout& /*mask_offset*/,
const TensorLayout& /*mask_val*/,
                                  const TensorLayout& /*dst*/) override {
        return 0;
    }
void set_error_tracker(void* tracker) override {
m_error_tracker = tracker;
}
};
} // namespace cuda
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -26,6 +26,7 @@
#include "src/cuda/convpooling/opr_impl.h"
#include "src/cuda/cumsum/opr_impl.h"
#include "src/cuda/cvt_color/opr_impl.h"
#include "src/cuda/dct/opr_impl.h"
#include "src/cuda/deformable_conv/opr_impl.h"
#include "src/cuda/deformable_ps_roi_pooling/opr_impl.h"
#include "src/cuda/dot/opr_impl.h"
......
/**
* \file dnn/src/naive/dct/opr_impl.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include <cmath>
#include "megdnn/basic_types.h"
#include "megdnn/dtype.h"
#include "midout.h"
#include "src/naive/dct/opr_impl.h"
#include "src/naive/handle.h"
#include "src/naive/matrix_mul/matrix_mul_helper.h"
MIDOUT_DECL(megdnn_naive_dct_fwd)
namespace megdnn {
namespace naive {
namespace {
static inline void generate_c_matrix(float* result, int block) {
constexpr float pi = M_PI;
for (int i = 0; i < block; ++i) {
for (int j = 0; j < block; ++j) {
float alpha = i == 0 ? sqrt(1.f / static_cast<float>(block))
: sqrt(2.f / static_cast<float>(block));
result[i * block + j] = alpha * cos((2.f * j + 1.f) * i * pi /
static_cast<float>(2 * block));
}
}
}
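//! in matrix form (B = block): c_matrix[i][j] = alpha_i * cos((2j + 1) * i *
//! pi / (2B)), with alpha_0 = sqrt(1 / B) and alpha_{i>0} = sqrt(2 / B); the
//! two matmul calls below then evaluate Y = C * X * C^T for each BxB block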
template <typename T>
void matmul(int m, int n, int k, int lda, int ldb, int ldc, const float* a,
const T* b, float* c, bool trans_a, bool trans_b) {
for (int m_idx = 0; m_idx < m; ++m_idx) {
for (int n_idx = 0; n_idx < n; ++n_idx) {
float res = 0.f;
for (int k_idx = 0; k_idx < k; ++k_idx) {
float av = trans_a ? a[k_idx * lda + m_idx]
: a[m_idx * lda + k_idx];
float bv = trans_b ? b[n_idx * ldb + k_idx]
: b[k_idx * ldb + n_idx];
res += av * bv;
}
c[m_idx * ldc + n_idx] = res;
}
}
}
std::vector<std::vector<int>> mask_offset_to_2dmask(
_megdnn_tensor_in mask_offset, _megdnn_tensor_in mask_val) {
std::vector<std::vector<int>> mask;
if (mask_offset.layout.ndim > 0 && mask_offset.layout[0] >= 2) {
const int offset_len = mask_offset.layout.shape[0];
const int32_t* mask_offset_ptr = mask_offset.ptr<int32_t>();
const int32_t* mask_val_ptr = mask_val.ptr<int32_t>();
megdnn_assert(
mask_val.layout.shape[0] ==
static_cast<size_t>(mask_offset_ptr[offset_len - 1]),
"check mask offset %zu != %zu", mask_val.layout.shape[0],
static_cast<size_t>(mask_offset_ptr[offset_len - 1]));
for (int offset_idx = 1; offset_idx < offset_len; ++offset_idx) {
mask.push_back({});
const int mask_len = mask_offset_ptr[offset_idx] -
mask_offset_ptr[offset_idx - 1];
const int32_t* mask_ptr =
&mask_val_ptr[mask_offset_ptr[offset_idx - 1]];
for (int val_idx = 0; val_idx < mask_len; ++val_idx) {
mask[offset_idx - 1].push_back(mask_ptr[val_idx]);
}
}
}
return mask;
}
inline bool is_layout_nchw4(const TensorLayout& layout) {
    return layout.ndim == 5 && layout[4] == 4;
}
template <typename T>
using QuantizedCType =
std::enable_if_t<DTypeTrait<T>::category == DTypeCategory::QUANTIZED,
typename DTypeTrait<T>::ctype>;
inline int8_t quant_float_2_int8(float val, DType dtype) {
return dtype.param<::megdnn::dtype::QuantizedS8>().quantize(val).as_int8();
}
template <param::DctChannelSelect::Format format, typename Dtype>
inline void dct_output(Dtype* dst_ptr, const int oc_idx, const int img_size,
float val, DType) {
dst_ptr[oc_idx * img_size] = val;
}
template <>
inline void dct_output<param::DctChannelSelect::Format::NCHW4>(
int8_t* dst_ptr, const int oc_idx, const int img_size, float val,
DType dtype) {
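    // NCHW4 packs channels four at a time: coefficient oc_idx lands in
    // channel group oc_idx / 4 (stride 4 * img_size) at inner slot oc_idx % 4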
dst_ptr[oc_idx / 4 * 4 * img_size + oc_idx % 4] =
quant_float_2_int8(val, dtype);
}
template <param::DctChannelSelect::Format format>
struct ChannelBlock {
static constexpr int block = 1;
};
template <>
struct ChannelBlock<param::DctChannelSelect::Format::NCHW4> {
static constexpr int block = 4;
};
template <param::DctChannelSelect::Format format, typename Dtype>
void naive_dct(const uint8_t* src, Dtype* dst, int n, int c, int h, int w,
int block, const std::vector<std::vector<int>>& mask,
DType dtype) {
    constexpr int block_channel = ChannelBlock<format>::block;
const int block_h = block;
const int block_w = block;
std::vector<float> c_matrix(block * block);
std::vector<float> tmp(block * block);
std::vector<float> tmp_result(block * block);
generate_c_matrix(&c_matrix[0], block);
megdnn_assert(h % block_h == 0, "h mod block_h == 0");
megdnn_assert(w % block_w == 0, "w mod block_w == 0");
const int oh = h / block_h;
const int ow = w / block_w;
const int o_img_size = oh * ow;
std::vector<int> mask_offset;
int mask_len_sum = 0;
if (mask.size() > 0) {
for (auto& sub_mask : mask) {
mask_offset.push_back(mask_len_sum);
mask_len_sum += sub_mask.size();
}
} else {
for (int c_idx = 0; c_idx < c; ++c_idx) {
mask_offset.push_back(mask_len_sum);
mask_len_sum += block_h * block_w;
}
}
const size_t o_batch_stride = mask_len_sum * oh * ow;
for (int n_idx = 0; n_idx < n; ++n_idx) {
for (int c_idx = 0; c_idx < c; ++c_idx) {
megdnn_assert(mask_offset[c_idx] % block_channel == 0,
"%d mod %d == 0", mask_offset[c_idx], block_channel);
const size_t src_offset = n_idx * c * h * w + c_idx * h * w;
const uint8_t* src_channel = src + src_offset;
const size_t dst_offset = n_idx * o_batch_stride +
mask_offset[c_idx] / block_channel * oh *
ow * block_channel;
Dtype* dst_channel = dst + dst_offset;
for (int oh_idx = 0; oh_idx < oh; ++oh_idx) {
for (int ow_idx = 0; ow_idx < ow; ++ow_idx) {
matmul(block, block, block, block, w, block, &c_matrix[0],
&src_channel[oh_idx * block_h * w +
ow_idx * block_w],
&tmp[0], false, false);
matmul(block, block, block, block, block, block, &tmp[0],
&c_matrix[0], &tmp_result[0], false, true);
Dtype* dst_start = dst_channel +
(oh_idx * ow + ow_idx) * block_channel;
if (mask.size() == 0) {
for (int inner_h_idx = 0; inner_h_idx < block_h;
++inner_h_idx) {
for (int inner_w_idx = 0; inner_w_idx < block_w;
++inner_w_idx) {
const int oc_idx =
inner_h_idx * block_w + inner_w_idx;
dct_output<format>(
dst_start, oc_idx, o_img_size,
tmp_result[inner_h_idx * block +
inner_w_idx],
dtype);
}
}
} else {
//! with mask
auto& sub_mask = mask[c_idx];
                        int out_channel_idx = 0;
                        for (auto mask_idx : sub_mask) {
                            dct_output<format>(dst_start, out_channel_idx,
                                               o_img_size, tmp_result[mask_idx],
                                               dtype);
                            ++out_channel_idx;
}
}
}
}
}
}
}
} // namespace
void DctChannelSelectForwardImpl::exec(_megdnn_tensor_in src,
_megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val,
_megdnn_tensor_out dst,
_megdnn_workspace /*workspace*/) {
MIDOUT_BEGIN(megdnn_naive_dct_fwd) {
int in = src.layout.shape[0];
int ic = src.layout.shape[1];
int ih = src.layout.shape[2];
int iw = src.layout.shape[3];
megdnn_assert(dst.raw_ptr, "dst can not be nullptr");
const int block = param().dct_block_size;
auto mask = mask_offset_to_2dmask(mask_offset, mask_val);
if (dst.layout.dtype.enumv() == DTypeEnum::Float32) {
megdnn_assert(!is_layout_nchw4(dst.layout) &&
param().format == Param::Format::NCHW,
"dst must be nchw");
MEGDNN_DISPATCH_CPU_KERN_OPR(naive_dct<Param::Format::NCHW>(
src.ptr<uint8_t>(), dst.ptr<float>(), in, ic, ih, iw, block,
mask, dst.layout.dtype));
} else {
megdnn_assert(dst.layout.dtype.enumv() == DTypeEnum::QuantizedS8,
"dst must be q8");
megdnn_assert(is_layout_nchw4(dst.layout) &&
param().format == Param::Format::NCHW4,
"dst must be nchw4");
MEGDNN_DISPATCH_CPU_KERN_OPR(naive_dct<Param::Format::NCHW4>(
src.ptr<uint8_t>(), static_cast<int8_t*>(dst.raw_ptr), in,
ic, ih, iw, block, mask, dst.layout.dtype));
}
}
MIDOUT_END();
}
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
/**
* \file dnn/src/naive/dct/opr_impl.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include "megdnn/oprs.h"
namespace megdnn {
namespace naive {
class DctChannelSelectForwardImpl : public DctChannelSelectForward {
public:
using DctChannelSelectForward::DctChannelSelectForward;
void exec(_megdnn_tensor_in src, _megdnn_tensor_in mask_offset,
_megdnn_tensor_in mask_val, _megdnn_tensor_out dst,
_megdnn_workspace workspace) override;
size_t get_workspace_in_bytes(const TensorLayout& /*src*/,
const TensorLayout&, const TensorLayout&,
const TensorLayout&) override {
return 0;
    }
};
} // namespace naive
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -6,7 +6,8 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "src/naive/handle.h"
@@ -29,6 +30,7 @@
#include "src/naive/convpooling/opr_impl.h"
#include "src/naive/cumsum/opr_impl.h"
#include "src/naive/cvt_color/opr_impl.h"
#include "src/naive/dct/opr_impl.h"
#include "src/naive/deformable_conv/opr_impl.h"
#include "src/naive/deformable_ps_roi_pooling/opr_impl.h"
#include "src/naive/dot/opr_impl.h"
@@ -56,6 +58,7 @@
#include "src/naive/reduce/opr_impl.h"
#include "src/naive/relayout/opr_impl.h"
#include "src/naive/relayout_format/opr_impl.h"
#include "src/naive/remap/opr_impl.h"
#include "src/naive/repeat/opr_impl.h"
#include "src/naive/resize/opr_impl.h"
#include "src/naive/rng/opr_impl.h"
@@ -76,7 +79,6 @@
#include "src/naive/warp_affine/opr_impl.h"
#include "src/naive/warp_perspective/opr_impl.h"
#include "src/naive/winograd_filter_preprocess/opr_impl.h"
#include "src/naive/remap/opr_impl.h"
static size_t g_image2d_pitch_alignment = 1;
......
@@ -6,20 +6,21 @@
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <map>
#include <memory>
#include <vector>
#include <regex>
#include <vector>
#include "megdnn/basic_types.h"
#include "megdnn/tensor_format.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
#include "test/common/timer.h"
#include "test/common/opr_algo_proxy.h"
namespace megdnn {
namespace test {
@@ -31,6 +32,7 @@ public:
using TensorValueArray = TensorNDArray;
using BeforeExecCallback =
std::function<void(Opr*, const TensorValueArray&)>;
using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
BenchmarkerBase(Handle* handle, T timer)
: m_timer(timer),
@@ -54,6 +56,8 @@ public:
}
float exec(TensorLayoutArray layouts);
float exect(const TensorValueArray& testcase_in);
    //! disambiguate overloaded exec
float execs(const TensorShapeArray& shapes) { return exec(shapes); }
float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
@@ -73,6 +77,11 @@ public:
m_fmt[idx] = fmt;
return *this;
}
BenchmarkerBase& set_tensors_constraint(
const TensorsConstriant& tensor_constraint) {
m_tensor_constraint = tensor_constraint;
return *this;
}
TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
TensorLayoutArray layouts(shapes.size());
for (size_t i = 0; i < shapes.size(); ++i) {
@@ -142,6 +151,7 @@ private:
std::unique_ptr<OprProxy<Opr>> m_proxy;
BeforeExecCallback m_before_exec_callback;
std::unique_ptr<Opr> m_opr;
TensorsConstriant m_tensor_constraint;
};
template <typename Opr, typename T>
@@ -184,10 +194,16 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
auto rng = m_rng[i];
if (!rng)
rng = m_default_rng.get();
auto size = tensor.layout.span().high_byte;
rng->gen(tensor);
}
if (m_tensor_constraint) {
m_tensor_constraint(tensors_cur_host);
}
for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
TensorND& tensor = tensors_cur_host[i];
if (tensor.layout.ndim == 0)
continue;
auto size = tensor.layout.span().high_byte;
megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
size);
}
@@ -243,6 +259,105 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
return time_in_ms;
}
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
auto opr = this->opr();
opr->param() = m_param;
TensorLayoutArray layouts;
TensorNDArray tensors_cur_host;
for (auto& inp : testcase_in) {
layouts.push_back(inp.layout);
tensors_cur_host.emplace_back(inp);
}
auto user_layouts = layouts;
m_proxy->deduce_layout(opr, layouts);
for (size_t i = 0; i < layouts.size(); ++i)
if (user_layouts[i].ndim > 0) {
auto run = [&]() {
ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
<< "User provided shape is "
<< user_layouts[i].TensorShape::to_string()
<< "\nExpected shape is "
<< layouts[i].TensorShape::to_string();
};
run();
}
auto allocate = [&layouts](Handle* handle) {
TensorNDArray tensors(layouts.size());
auto trans_func = [handle](const TensorLayout& layout) {
auto span = layout.span();
TensorND res;
res.raw_ptr = static_cast<uint8_t*>(
megdnn_malloc(handle, span.dist_byte())) +
span.low_byte;
res.layout = layout;
return res;
};
std::transform(layouts.begin(), layouts.end(), tensors.begin(),
trans_func);
return tensors;
};
auto tensors_cur = allocate(m_handle);
//! init
for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
TensorND& tensor = tensors_cur_host[i];
auto size = tensor.layout.span().high_byte;
if (tensor.layout.ndim == 0)
continue;
megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
size);
}
if (m_before_exec_callback) {
m_before_exec_callback(opr, tensors_cur);
}
//! run
//! warm up
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
if (m_adaptive_secs) {
//! find m_times for adaptive benchmarking
m_times = 0;
int cur_times = 1;
auto remain_time = m_adaptive_secs * 1e6;
while (remain_time > 0) {
m_timer.reset();
m_timer.start();
for (int i = 0; i < cur_times; ++i)
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
m_timer.stop();
m_times += cur_times;
auto this_run_time = m_timer.get_time_in_us();
remain_time -= this_run_time;
cur_times = std::min(
cur_times * 2,
std::max<int>(1, remain_time / this_run_time * cur_times));
}
}
m_timer.reset();
m_timer.start();
for (size_t t = 0; t < m_times; ++t)
m_proxy->exec(opr, tensors_cur);
megcoreSynchronize(m_handle->megcore_computing_handle());
m_timer.stop();
auto time_in_ms = m_timer.get_time_in_us() / 1e3;
if (m_display) {
std::cout << "Total time is " << time_in_ms << "ms "
<< "for " << m_times << " run(s)." << std::endl;
}
auto free = [](Handle* handle, TensorNDArray& tensors) {
std::for_each(tensors.begin(), tensors.end(),
[handle](const TensorND& tensor) {
megdnn_free(handle, tensor.raw_ptr);
});
};
free(m_handle, tensors_cur);
if (m_adaptive_secs)
time_in_ms /= m_times;
return time_in_ms;
}
template <typename Opr, typename T = Timer>
class Benchmarker;
......
@@ -294,8 +294,6 @@ void CheckerHelper::do_exec_with_testcases(const TensorValueArray& testcase_in,
ASSERT_TRUE(testcase_in[i].layout.ndim == 0 ||
testcase_out[i].layout.ndim == 0 ||
testcase_in[i].layout.eq_layout(testcase_out[i].layout));
ASSERT_TRUE(testcase_in[i].layout.ndim != 0 ||
testcase_out[i].layout.ndim != 0);
layouts.emplace_back(testcase_in[i].layout.ndim > 0
? testcase_in[i].layout
: testcase_out[i].layout);
......
@@ -392,7 +392,8 @@ TensorND TensorValue(const TensorShape& shape, T dtype,
tensor.layout = {shape, dtype};
tensor.raw_ptr =
static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
megdnn_assert(values.size() == tensor.layout.total_nr_elems());
    megdnn_assert(values.size() == tensor.layout.total_nr_elems(),
                  "%zu == %zu", values.size(),
                  tensor.layout.total_nr_elems());
auto ptr = tensor.ptr<typename DTypeTrait<T>::ctype>();
for (const auto& v : values) {
*ptr++ = typename DTypeTrait<T>::ctype(v);
......
/**
 * \file dnn/test/common/dct_ref.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "test/common/dct_ref.h"
namespace megdnn {
namespace test {
struct FixCase {
std::vector<int> mask_offset;
std::vector<int> mask_val;
};
using Param = DctChannelSelectForward::Param;
static inline FixCase get_fix_mask(Param::FastImpl impl) {
std::vector<int> fix_32_mask_offset{0, 16, 24, 32};
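    //! zig-zag scan order of the 8x8 block: 16 coefficients for channel 0,
    //! then 8 each for channels 1 and 2, matching the offsets above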
std::vector<int> fix_32_mask_val{0, 1, 8, 16, 9, 2, 3, 10, 17, 24, 32,
25, 18, 11, 4, 5, 0, 1, 8, 16, 9, 2,
3, 10, 0, 1, 8, 16, 9, 2, 3, 10};
megdnn_assert(impl == Param::FastImpl::FIX_32_MASK,
"only support gen FIX_32_MASK");
return {fix_32_mask_offset, fix_32_mask_val};
}
CheckerHelper::TensorsConstriant gen_dct_constriant(
const size_t /* n */, const size_t ic, const size_t ih, const size_t iw,
const size_t oc, Param param) {
auto constraint = [=](CheckerHelper::TensorValueArray& tensors_orig) {
const size_t block = param.dct_block_size;
const int block_c = param.format == Param::Format::NCHW4 ? 4 : 1;
megdnn_assert(oc % block_c == 0, "oc mod block_c must == 0");
std::shared_ptr<DctTestcase> test_case_ptr = DctTestcase::make();
DctTestcase& test_case = *test_case_ptr.get();
UniformIntRNG rng(0, 255);
UniformIntRNG mask_rng(0, 64 / block_c - 1);
const size_t no_mask_oc = ic * block * block;
megdnn_assert(ih % block == 0, "%zu mod %zu == 0", ih, block);
megdnn_assert(iw % block == 0, "%zu mod %zu == 0", iw, block);
TensorND mask_offset;
TensorND mask_val;
std::vector<int>& mask_offset_vec = test_case.mask_offset_vec;
std::vector<int>& mask_val_vec = test_case.mask_val_vec;
UniformIntRNG rng_oc(0, oc);
if (param.fastImpl == Param::FastImpl::FIX_32_MASK) {
auto fix_32_mask = get_fix_mask(Param::FastImpl::FIX_32_MASK);
mask_offset_vec = fix_32_mask.mask_offset;
mask_val_vec = fix_32_mask.mask_val;
megdnn_assert(oc == 32, "oc must eq 32");
} else if (no_mask_oc > oc) {
size_t remain_oc = oc;
mask_offset_vec.resize(ic + 1);
mask_val_vec.resize(oc);
mask_offset_vec[0] = 0;
for (size_t ic_idx = 0; ic_idx < ic; ++ic_idx) {
size_t random_len = (int)rng_oc.gen_single_val() * block_c;
size_t mask_len = (ic_idx == ic - 1) || (remain_oc == 0)
? remain_oc
: random_len % remain_oc;
megdnn_assert(mask_len % block_c == 0,
"mask_len mod block_c == 0, but %zu mod %d ",
mask_len, block_c);
const size_t oc_idx = mask_offset_vec[ic_idx];
remain_oc -= mask_len;
mask_offset_vec[ic_idx + 1] = oc_idx + mask_len;
for (size_t mask_idx = 0; mask_idx < mask_len; ++mask_idx) {
mask_val_vec[oc_idx + mask_idx] =
(int)mask_rng.gen_single_val();
}
}
}
mask_offset = TensorND(mask_offset_vec.data(),
{{mask_offset_vec.size()}, dtype::Int32()});
mask_val = TensorND(mask_val_vec.data(),
{{mask_val_vec.size()}, dtype::Int32()});
if (tensors_orig.size() > 1) {
megdnn_assert(tensors_orig.size() == 4, "tensors_orig.size() == 4");
megdnn_assert(mask_offset_vec.size() >= 2,
"mask_offset_vec.size() >= 2");
megdnn_assert(tensors_orig[1].layout == mask_offset.layout,
"tensors_orig[1].layout == mask_offset.layout");
megdnn_assert(tensors_orig[2].layout == mask_val.layout,
"tensors_orig[2].layout == mask_val.layout");
auto naive_handle = create_cpu_handle(2, false);
megdnn_memcpy_D2D(naive_handle.get(), tensors_orig[1].raw_ptr,
mask_offset.raw_ptr,
mask_offset.layout.span().dist_byte());
megdnn_memcpy_D2D(naive_handle.get(), tensors_orig[2].raw_ptr,
mask_val.raw_ptr,
mask_val.layout.span().dist_byte());
}
};
return constraint;
}
std::shared_ptr<DctTestcase> gen_dct_case(const size_t n, const size_t ic,
const size_t ih, const size_t iw,
const size_t oc, Param param,
DType dst_dtype,
bool correct_result) {
const size_t block = param.dct_block_size;
const int block_c = param.format == Param::Format::NCHW4 ? 4 : 1;
megdnn_assert(oc % block_c == 0, "oc mod block_c must == 0");
std::shared_ptr<DctTestcase> test_case_ptr = DctTestcase::make();
DctTestcase& test_case = *test_case_ptr.get();
UniformIntRNG rng(0, 255);
UniformIntRNG mask_rng(0, 64 / block_c - 1);
const size_t input_elements = n * ic * ih * iw;
const size_t no_mask_oc = ic * block * block;
megdnn_assert(ih % block == 0, "%zu mod %zu == 0", ih, block);
megdnn_assert(iw % block == 0, "%zu mod %zu == 0", iw, block);
std::vector<uint8_t>& inp_vec = test_case.inp_vec;
inp_vec.resize(input_elements);
TensorShape input_shape{n, ic, ih, iw};
for (auto& elm : inp_vec) {
elm = (uint8_t)rng.gen_single_val();
}
auto src = TensorND(inp_vec.data(), {input_shape, dtype::Uint8()});
TensorND mask_offset;
TensorND mask_val;
std::vector<int>& mask_offset_vec = test_case.mask_offset_vec;
std::vector<int>& mask_val_vec = test_case.mask_val_vec;
UniformIntRNG rng_oc(0, oc);
if (param.fastImpl == Param::FastImpl::FIX_32_MASK) {
auto fix_32_mask = get_fix_mask(Param::FastImpl::FIX_32_MASK);
mask_offset_vec = fix_32_mask.mask_offset;
mask_val_vec = fix_32_mask.mask_val;
megdnn_assert(oc == 32, "oc must eq 32");
} else if (no_mask_oc > oc) {
size_t remain_oc = oc;
mask_offset_vec.resize(ic + 1);
mask_val_vec.resize(oc);
mask_offset_vec[0] = 0;
for (size_t ic_idx = 0; ic_idx < ic; ++ic_idx) {
size_t random_len = (int)rng_oc.gen_single_val() * block_c;
size_t mask_len = (ic_idx == ic - 1) || (remain_oc == 0)
? remain_oc
: random_len % remain_oc;
megdnn_assert(mask_len % block_c == 0,
"mask_len mod block_c == 0, but %zu mod %d ",
mask_len, block_c);
const size_t oc_idx = mask_offset_vec[ic_idx];
remain_oc -= mask_len;
mask_offset_vec[ic_idx + 1] = oc_idx + mask_len;
for (size_t mask_idx = 0; mask_idx < mask_len; ++mask_idx) {
mask_val_vec[oc_idx + mask_idx] =
(int)mask_rng.gen_single_val();
}
}
}
mask_offset = TensorND(mask_offset_vec.data(),
{{mask_offset_vec.size()}, dtype::Int32()});
mask_val = TensorND(mask_val_vec.data(),
{{mask_val_vec.size()}, dtype::Int32()});
if (mask_offset_vec.size() >= 2) {
test_case.testcase_in = {
src, mask_offset, mask_val, {nullptr, {{}, dst_dtype}}};
} else {
test_case.testcase_in = {src, {}, {}, {nullptr, {{}, dst_dtype}}};
}
auto naive_handle = create_cpu_handle(2, false);
auto opr_naive = naive_handle->create_operator<DctChannelSelectForward>();
opr_naive->param() = param;
using Proxy = OprProxy<DctChannelSelectForward>;
Proxy naive_proxy;
TensorLayout temp_dst_layout;
temp_dst_layout.dtype = dst_dtype;
TensorLayoutArray layouts{src.layout, mask_offset.layout, mask_val.layout,
temp_dst_layout};
naive_proxy.deduce_layout(opr_naive.get(), layouts);
const size_t output_elements = layouts[3].total_nr_elems();
std::vector<float>& output_vec = test_case.output_vec;
output_vec.resize(output_elements);
auto dst = TensorND(output_vec.data(), layouts[3]);
DctTestcase::TensorValueArray testcase_naive;
testcase_naive.emplace_back(test_case.testcase_in[0]);
testcase_naive.emplace_back(test_case.testcase_in[1]);
testcase_naive.emplace_back(test_case.testcase_in[2]);
testcase_naive.emplace_back(dst);
if (correct_result) {
naive_proxy.exec(opr_naive.get(), testcase_naive);
}
test_case.testcase_out = {{}, {}, {}, dst};
return test_case_ptr;
}
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
\ No newline at end of file
/**
 * \file dnn/test/common/dct_ref.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <math.h>
#include <vector>
#include "megdnn/dtype.h"
#include "megdnn/oprs/nn.h"
#include "test/common/checker.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
namespace megdnn {
namespace test {
using Param = DctChannelSelectForward::Param;
struct DctTestcase {
using TensorValueArray = TensorNDArray;
TensorValueArray testcase_in;
TensorValueArray testcase_out;
std::vector<uint8_t> inp_vec;
std::vector<int> mask_offset_vec;
std::vector<int> mask_val_vec;
std::vector<float> output_vec;
static std::shared_ptr<DctTestcase> make() {
return std::make_shared<DctTestcase>();
}
};
CheckerHelper::TensorsConstriant gen_dct_constriant(
const size_t n, const size_t ic, const size_t ih, const size_t iw,
const size_t oc, Param param);
std::shared_ptr<DctTestcase> gen_dct_case(const size_t n, const size_t ic,
const size_t ih, const size_t iw,
const size_t oc, Param param,
DType dst_dtype = dtype::Float32(),
bool correct_result = true);
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
@@ -110,6 +110,7 @@ DEF(BatchConvBiasForward, 5, true, true);
DEF(Remap, 3, true, true);
DEF(RemapBackwardData, 3, true, false);
DEF(RemapBackwardMat, 4, true, false);
DEF(DctChannelSelectForward, 4, true, true);
} // namespace test
} // namespace megdnn
......
/**
* \file dnn/test/cuda/dct.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megdnn/oprs/nn.h"
#include "test/common/benchmarker.h"
#include "test/common/checker.h"
#include "test/common/dct_ref.h"
#include "test/common/rng.h"
#include "test/cuda/fixture.h"
namespace megdnn {
namespace test {
TEST_F(CUDA, DCT) {
DctChannelSelectForward::Param param;
Checker<DctChannelSelectForward> checker(handle_cuda());
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
TEST_F(CUDA, DCT_QINT8) {
DctChannelSelectForward::Param param;
Checker<DctChannelSelectForward> checker(handle_cuda());
param.format = Param::Format::NCHW4;
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, dtype::QuantizedS8(10.f))
.set_epsilon(1)
.execs({TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_FIX_32_MASK) {
using Param = DctChannelSelectForward::Param;
Param param;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
param.fastImpl = Param::FastImpl::FIX_32_MASK;
auto test_case = gen_dct_case(3, 3, 1024, 768, 32, param);
checker.set_param(param).exect(test_case->testcase_in,
test_case->testcase_out);
}
TEST_F(CUDA, DCT_WITH_FIX_32_MASK_QINT8) {
using Param = DctChannelSelectForward::Param;
Param param;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
param.fastImpl = Param::FastImpl::FIX_32_MASK;
param.format = Param::Format::NCHW4;
auto test_case =
gen_dct_case(3, 3, 1024, 768, 32, param, dtype::QuantizedS8(10.f));
checker.set_param(param).set_epsilon(1).exect(test_case->testcase_in,
test_case->testcase_out);
}
TEST_F(CUDA, DCT_WITH_MASK) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
checker.set_param(param).exect(
Testcase{TensorValue(
{1, 3, 8, 16}, dtype::Uint8(),
{109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204,
109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204,
109, 39, 30, 115, 71, 15, 206, 139, 221, 5,
18, 16, 93, 185, 99, 102, 205, 172, 191, 29,
185, 6, 47, 84, 0, 47, 105, 203, 251, 73,
196, 83, 3, 211, 32, 181, 49, 111, 114, 83,
148, 232, 77, 17, 35, 2, 154, 100, 41, 135,
141, 206, 56, 91, 137, 199, 104, 192, 75, 122,
78, 65, 184, 69, 91, 82, 2, 172, 194, 240,
49, 145, 87, 210, 97, 190, 179, 93, 125, 105,
181, 207, 148, 178, 133, 53, 25, 198, 238, 151,
14, 120, 213, 195, 145, 20, 122, 107, 217, 185,
65, 5, 115, 110, 82, 206, 163, 86, 2, 2,
44, 125, 50, 38, 41, 106, 30, 5, 151, 243,
238, 181, 232, 191, 161, 57, 23, 204}),
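                         // mask_offset {0, 14, 22, 30}: channel 0 keeps 14
                         // coefficients, channels 1 and 2 keep 8 each, so
                         // the expected dst below is (1, 30, 8/8, 16/8)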
TensorValue({4}, dtype::Int32(), {0, 14, 22, 30}),
TensorValue({30}, dtype::Int32(),
{8, 16, 9, 2, 3, 10, 17, 24, 32, 25,
18, 11, 4, 5, 0, 1, 8, 16, 9, 2,
3, 10, 0, 1, 8, 16, 9, 2, 3, 10}),
{}},
Testcase{{},
{},
{},
TensorValue({1, 30, 1, 2}, dtype::Float32(),
{-22.850792, -97.862236, -101.043236,
-4.727012, 28.275675, -157.96654,
42.1377, 45.06531, -149.77373,
24.487143, -8.054966, -13.990831,
-6.9395194, -3.9211385, 64.79172,
-12.363858, -47.875, 59.,
56.271786, -62.725567, 120.522675,
16.559765, 85.74334, 112.904495,
99.375, 29.499973, 2.0220923,
-19.681704, 890.12494, 941.25,
-7.0498576, 99.47632, -22.850792,
-97.862236, -101.043236, -4.727012,
28.275675, -157.96654, 42.1377,
45.06531, -149.77373, 24.487143,
-8.054966, -13.990831, 890.12494,
941.25, -7.0498576, 99.47632,
-22.850792, -97.862236, -101.043236,
-4.727012, 28.275675, -157.96654,
42.1377, 45.06531, -149.77373,
24.487143, -8.054966, -13.990831})});
}
TEST_F(CUDA, DCT_WITH_MASK2) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
auto test_case =
gen_dct_case(n, ic, ih, iw, mask_oc, param);
checker.set_param(param).exect(test_case->testcase_in,
test_case->testcase_out);
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_MASK2_QINT8) {
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
DctChannelSelectForward::Param param;
param.format = DctChannelSelectForward::Param::Format::NCHW4;
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
mask_oc = (mask_oc + 3) / 4 * 4;
auto test_case = gen_dct_case(n, ic, ih, iw, mask_oc, param,
dtype::QuantizedS8(10.f));
checker.set_param(param).set_epsilon(1).exect(
test_case->testcase_in, test_case->testcase_out);
}
}
}
}
}
TEST_F(CUDA, DCT_WITH_MASK2_QINT8_CONSTRAINT) {
DctChannelSelectForward::Param param;
param.format = DctChannelSelectForward::Param::Format::NCHW4;
Checker<DctChannelSelectForward> checker(handle_cuda(), false);
checker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, dtype::QuantizedS8(10.f))
.set_epsilon(1);
UniformIntRNG rng_oc(0, 3 * 64);
for (size_t n : {1, 3}) {
for (size_t ic : {1, 3}) {
for (size_t ih : {8, 16, 32, 512, 1024}) {
for (size_t iw : {8, 16, 32, 64, 128, 256, 512, 1024}) {
int random_oc = static_cast<int>(rng_oc.gen_single_val());
int max_oc = ic * 64;
int mask_oc = (random_oc % max_oc) + 1;
mask_oc = (mask_oc + 3) / 4 * 4;
if (mask_oc < max_oc) {
checker
.set_tensors_constraint(gen_dct_constriant(
n, ic, ih, iw, mask_oc, param))
.exec({TensorShape{n, ic, ih, iw},
TensorShape{ic + 1},
TensorShape{(size_t)mask_oc},
{}});
} else {
checker.set_tensors_constraint({}).exec(
{TensorShape{n, ic, ih, iw}, {}, {}, {}});
}
}
}
}
}
}
#if MEGDNN_WITH_BENCHMARK
TEST_F(CUDA, BENCHMARK_DCT) {
using Param = DctChannelSelectForward::Param;
auto run = [&](const TensorShapeArray& shapes, Param param) {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param);
benchmarker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32());
for (auto&& shape : shapes) {
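            // nominal cost model: ~32 ops per input pixel for the two-pass
            // 8x8 DCT; the Gops printed below are relative to that model,
            // not measured flops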
double computation = double(shape[0]) * shape[1] * shape[2] *
shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.execs({shape, {}, {}, {}});
printf("execute %s, %.4f Gops\n", shape.to_string().c_str(),
computation / time_ms);
}
};
auto run_case = [&](const DctTestcase& testcase, Param param,
std::string comment = "") {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param);
benchmarker.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, testcase.testcase_out[3].layout.dtype);
auto src_shape = testcase.testcase_in[0].layout;
double computation = double(src_shape[0]) * src_shape[1] *
src_shape[2] * src_shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.exect(testcase.testcase_in);
printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
src_shape.to_string().c_str(), computation / time_ms);
};
auto run_case_constraint =
[&](const Benchmarker<DctChannelSelectForward>::TensorsConstriant&
constraint,
Param param, const TensorShapeArray& shapes,
                std::string comment, DType output_dtype) {
Benchmarker<DctChannelSelectForward> benchmarker(handle_cuda());
benchmarker.set_param(param)
.set_dtype(0, dtype::Uint8())
.set_dtype(1, dtype::Int32())
.set_dtype(2, dtype::Int32())
.set_dtype(3, output_dtype)
.set_tensors_constraint(constraint);
auto src_shape = shapes[0];
double computation = double(src_shape[0]) * src_shape[1] *
src_shape[2] * src_shape[3] * 32.0 * 1e-6;
auto time_ms = benchmarker.exec(shapes);
printf("[%s] execute %s, %.4f Gops\n", comment.c_str(),
src_shape.to_string().c_str(), computation / time_ms);
};
TensorShapeArray shapes = {
{1, 3, 512, 512},
{8, 3, 2176, 3840},
};
{
Param param;
run(shapes, param);
}
Param fix_32_param;
fix_32_param.fastImpl = Param::FastImpl::FIX_32_MASK;
{
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
run_case(*test_case, fix_32_param, "FIX_32_MASK");
}
{
Param param;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_param);
run_case(*test_case, param, "MASK 32");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
dtype::QuantizedS8(10.f));
run_case(*test_case, fix_32_nchw4_param, "FIX_32_MASK QINT8");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
auto test_case = gen_dct_case(8, 3, 2176, 3840, 32, fix_32_nchw4_param,
dtype::QuantizedS8(10.f));
fix_32_nchw4_param.fastImpl = Param::FastImpl::NONE;
run_case(*test_case, fix_32_nchw4_param, "MASK 32 QINT8");
}
{
Param fix_32_nchw4_param;
fix_32_nchw4_param.fastImpl = Param::FastImpl::FIX_32_MASK;
fix_32_nchw4_param.format = Param::Format::NCHW4;
TensorShapeArray shapes = {{8, 3, 2176, 3840}, {4}, {32}, {}};
auto constraint =
gen_dct_constriant(8, 3, 2176, 3840, 32, fix_32_nchw4_param);
run_case_constraint(constraint, fix_32_nchw4_param, shapes,
"FIX_32_MASK QINT8 Constraint",
dtype::QuantizedS8(10.f));
}
}
#endif
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen