diff --git a/dnn/src/common/pooling.cpp b/dnn/src/common/pooling.cpp
index 597fc30c33cca0998d6a8c8111082aab30eded0a..32a11d4e99ea3f8cfcf4d287e029a56cad5dfe10 100644
--- a/dnn/src/common/pooling.cpp
+++ b/dnn/src/common/pooling.cpp
@@ -47,7 +47,8 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
     } else if (param().format == Param::Format::NCHW4 ||
                param().format == Param::Format::NCHW44 ||
                param().format == Param::Format::NCHW88 ||
-               param().format == Param::Format::NCHW32) {
+               param().format == Param::Format::NCHW32 ||
+               param().format == Param::Format::NCHW64) {
         megdnn_assert(src.ndim == 5_z, "%s", errmsg_c);
         spatial_pos = 2;
@@ -82,6 +83,9 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
     if (param().format == Param::Format::NCHW32) {
         c *= 32;
     }
+    if (param().format == Param::Format::NCHW64) {
+        c *= 64;
+    }
     size_t oh, ow;
     size_t fh = this->param().window_h;
     size_t fw = this->param().window_w;
@@ -109,6 +113,8 @@ void PoolingBase::deduce_layout_fwd(const TensorLayout& src,
         dst = TensorLayout{{n, c / 8, oh, ow, 8}, src.dtype, src.format};
     } else if (param().format == Param::Format::NCHW32) {
         dst = TensorLayout{{n, c / 32, oh, ow, 32}, src.dtype, src.format};
+    } else if (param().format == Param::Format::NCHW64) {
+        dst = TensorLayout{{n, c / 64, oh, ow, 64}, src.dtype, src.format};
     } else if (param().format == Param::Format::CHWN4) {
         dst = TensorLayout{{c / 4, oh, ow, n, 4}, src.dtype, src.format};
     } else {
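Note on the hunks above: NCHW64 stores channels in blocks of 64, so a 4-d NCHW shape (N, C, H, W) becomes the 5-d shape (N, C/64, H, W, 64) and only the two spatial dimensions take part in the window arithmetic. A minimal standalone sketch of that shape deduction (hypothetical helper names, not part of the patch):

    #include <cstddef>

    struct Shape5 { size_t n, cb, h, w, pack; };  // (N, C/64, H, W, 64)

    // Standard pooling output-extent formula, applied per spatial dim;
    // the batch, channel-block and pack dims pass through unchanged.
    Shape5 deduce_nchw64(Shape5 src, size_t pad, size_t window, size_t stride) {
        auto out = [&](size_t i) { return (i + 2 * pad - window) / stride + 1; };
        return {src.n, src.cb, out(src.h), out(src.w), 64};
    }
    // e.g. (64, 8, 28, 28, 64), window 2, stride 2, pad 0 -> (64, 8, 14, 14, 64)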
diff --git a/dnn/src/cuda/pooling/opr_impl.cpp b/dnn/src/cuda/pooling/opr_impl.cpp
index 89fbcb918a761b8739fff77a4d07f06844d48621..e88dfc61382607a3993885c2661e18e9f4ce1c29 100644
--- a/dnn/src/cuda/pooling/opr_impl.cpp
+++ b/dnn/src/cuda/pooling/opr_impl.cpp
@@ -9,13 +9,50 @@
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  */
 #include "src/cuda/pooling/opr_impl.h"
+#include "src/cuda/relayout_format/opr_impl.h"
 
-#include "./pooling2d_int8.cuh"
+#include "./pooling2d_qint.cuh"
 #include "src/cuda/utils.h"
 
 namespace megdnn {
 namespace cuda {
 
+namespace {
+inline void deduce_reformat_layout(std::unique_ptr<RelayoutFormat>& relayout,
+                                   const TensorLayout& src_layout,
+                                   TensorLayout& dst_layout,
+                                   RelayoutFormat::Param::Mode mode,
+                                   const int oc = 0, const int group = 1) {
+    if (src_layout.ndim > 0) {
+        RelayoutFormat::Param trans_param;
+        trans_param.mode = mode;
+        trans_param.oc = oc;
+        trans_param.group = group;
+        relayout->param() = trans_param;
+        relayout->deduce_layout(src_layout, dst_layout);
+    } else {
+        dst_layout = src_layout;
+    }
+}
+
+void get_inner_layout(const TensorLayout& src, const TensorLayout& dst,
+                      TensorLayout& inner_src, TensorLayout& inner_dst,
+                      Handle* handle,
+                      PoolingForwardImpl::Param::Format format) {
+    bool is_nchw = format == PoolingForwardImpl::Param::Format::NCHW;
+    if (src.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+        dst.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        auto relayout_opr = handle->create_operator<RelayoutFormat>();
+        deduce_reformat_layout(relayout_opr, src, inner_src,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
+        deduce_reformat_layout(relayout_opr, dst, inner_dst,
+                               RelayoutFormat::Param::Mode::NCHW_NCHW64, 0, 1);
+    } else {
+        megdnn_assert(0, "not supported");
+    }
+}
+
+}  // namespace
 
 void PoolingForwardImpl::setup_descs(const TensorLayout& src,
                                      const TensorLayout& dst) {
     src_desc.set(src, param().format);
@@ -28,14 +65,22 @@ WorkspaceBundle PoolingForwardImpl::get_workspace_bundle(
     SmallVector<size_t> sizes;
     TensorLayout fsrc = src;
     TensorLayout fdst = dst;
-    auto get_workspace = [&sizes](TensorLayout& layout) {
-        if (layout.dtype == dtype::BFloat16()) {
-            layout.dtype = dtype::Float32();
-            sizes.push_back(layout.span().dist_byte());
-        }
-    };
-    get_workspace(fsrc);
-    get_workspace(fdst);
+    bool is_nchw = param().format == Param::Format::NCHW;
+    if (src.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+        dst.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        get_inner_layout(src, dst, fsrc, fdst, handle(), param().format);
+        sizes.push_back(fsrc.span().dist_byte());
+        sizes.push_back(fdst.span().dist_byte());
+    } else {
+        auto get_workspace = [&sizes](TensorLayout& layout) {
+            if (layout.dtype == dtype::BFloat16()) {
+                layout.dtype = dtype::Float32();
+                sizes.push_back(layout.span().dist_byte());
+            }
+        };
+        get_workspace(fsrc);
+        get_workspace(fdst);
+    }
     return {ptr, std::move(sizes)};
 }
 
@@ -44,12 +89,27 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
     check_exec(ssrc.layout, sdst.layout, sworkspace.size);
     TensorND src = ssrc;
     TensorND dst = sdst;
+    Param::Format inner_format = param().format;
     auto wsb = get_workspace_bundle(sworkspace.raw_ptr, ssrc.layout, sdst.layout);
     auto ctypecvt = CompTypeCvter<dtype::BFloat16, dtype::Float32>(
             concrete_handle(this->handle()), &wsb);
+    bool is_nchw = param().format == Param::Format::NCHW;
     if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
         ctypecvt.src_to_comp_type(ssrc, src).src_to_comp_type(sdst, dst);
+    } else if (ssrc.layout.dtype.enumv() == DTypeEnum::QuantizedS4 &&
+               sdst.layout.dtype.enumv() == DTypeEnum::QuantizedS4 && is_nchw) {
+        auto handle_ptr = handle();
+        get_inner_layout(ssrc.layout, sdst.layout, src.layout, dst.layout,
+                         handle_ptr, param().format);
+        src.raw_ptr = wsb.get(0);
+        dst.raw_ptr = wsb.get(1);
+        auto relayout_opr = handle_ptr->create_operator<RelayoutFormat>();
+        RelayoutFormat::Param trans_param;
+        trans_param.mode = RelayoutFormat::Param::Mode::NCHW_NCHW64;
+        relayout_opr->param() = trans_param;
+        relayout_opr->exec(ssrc, src, {});
+        inner_format = Param::Format::NCHW64;
     }
     {
         using Format = param::Pooling::Format;
@@ -104,6 +164,34 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
             return pooling2d::do_pooling2d_int8_ncdiv32hw32(
                     src.compatible_ptr<int8_t>(), dst.compatible_ptr<int8_t>(),
                     kern_param, stream, static_cast<uint32_t>(param().mode));
+        } else if (param().format == Format::NCHW64 ||
+                   inner_format == Format::NCHW64) {
+            megdnn_assert(src.layout.dtype.enumv() == DTypeEnum::QuantizedS4,
+                          "but %s", src.layout.dtype.name());
+            pooling2d::Param kern_param;
+            size_t n = src.layout[0], hi = src.layout[2], wi = src.layout[3],
+                   c = src.layout[1], ho = dst.layout[2], wo = dst.layout[3];
+            c = c * 64;
+            size_t ph = param().pad_h, pw = param().pad_w;
+            size_t window_h = param().window_h, window_w = param().window_w;
+            size_t sh = param().stride_h, sw = param().stride_w;
+            kern_param.n = n, kern_param.c = c, kern_param.hi = hi,
+            kern_param.wi = wi, kern_param.ho = ho, kern_param.wo = wo,
+            kern_param.ph = ph, kern_param.pw = pw,
+            kern_param.window_h = window_h, kern_param.window_w = window_w,
+            kern_param.sh = sh, kern_param.sw = sw;
+            auto&& stream = cuda_stream(handle());
+            pooling2d::do_pooling2d_int4_ncdiv64hw64(
+                    (int8_t*)src.raw_ptr, (int8_t*)dst.raw_ptr, kern_param,
+                    stream, static_cast<uint32_t>(param().mode));
+            if (sdst.layout.ndim == 4) {
+                auto relayout_opr = handle()->create_operator<RelayoutFormat>();
+                RelayoutFormat::Param trans_param;
+                trans_param.mode = RelayoutFormat::Param::Mode::NCHW64_NCHW;
+                relayout_opr->param() = trans_param;
+                relayout_opr->exec(dst, sdst, {});
+            }
+            return;
         }
         auto handle = cudnn_handle(this->handle());
         setup_descs(src.layout, dst.layout);
@@ -114,7 +202,7 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in ssrc, _megdnn_tensor_out sdst,
     }
     if (ssrc.layout.dtype.enumv() == DTypeTrait<dtype::BFloat16>::enumv) {
         ctypecvt.comp_to_dst_type(dst, sdst);
-    } 
+    }
 }
 
 void PoolingBackwardImpl::setup_descs(const TensorLayout& src,
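The CUDA forward path above never runs the int4 kernel directly on NCHW data. For QuantizedS4 input in NCHW it round-trips through the workspace, roughly:

    // ssrc (NCHW, QuantizedS4) --NCHW_NCHW64--> src  (workspace bundle slot 0)
    // do_pooling2d_int4_ncdiv64hw64(src, dst):  dst  (NCHW64, workspace slot 1)
    // dst  (NCHW64)            --NCHW64_NCHW--> sdst (original NCHW output)

which is why get_workspace_bundle() reserves space for the two relayouted tensors for exactly this dtype/format combination.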
diff --git a/dnn/src/cuda/pooling/pooling2d_int8.cu b/dnn/src/cuda/pooling/pooling2d_int8.cu
deleted file mode 100644
index d8b12dadac940e3a3397cd15975b0eba28c10345..0000000000000000000000000000000000000000
--- a/dnn/src/cuda/pooling/pooling2d_int8.cu
+++ /dev/null
@@ -1,592 +0,0 @@
-/**
- * \file dnn/src/cuda/pooling/pooling2d_int8_cdiv4hwn4.cu
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-#include "./pooling2d_int8.cuh"
-#include "src/common/opr_param_defs_enumv.cuh"
-#include "src/cuda/query_blocksize.cuh"
-
-using namespace megdnn;
-using namespace cuda;
-using namespace pooling2d;
-
-namespace {
-// common macros
-#define FEED1 Base::feed(x, 0);
-#define FEED2           \
-    Base::feed(x.x, 0); \
-    Base::feed(x.y, 4);
-#define FEED4           \
-    FEED2;              \
-    Base::feed(x.z, 8); \
-    Base::feed(x.w, 12);
-
-#define ANS1(cb) cb(Base::res[0], Base::res[1], Base::res[2], Base::res[3], i1);
-
-#define ANS2(cb) \
-    ANS1(cb);    \
-    cb(Base::res[4], Base::res[5], Base::res[6], Base::res[7], i2);
-
-#define ANS4(cb)                                                      \
-    ANS2(cb);                                                         \
-    cb(Base::res[8], Base::res[9], Base::res[10], Base::res[11], i3); \
-    cb(Base::res[12], Base::res[13], Base::res[14], Base::res[15], i4);
-
-__device__ __forceinline__ int pack_int8_to_int8x4(int8_t x, int8_t y, int8_t z,
-                                                   int8_t w) {
-    int ix = static_cast<int>(x), iy = static_cast<int>(y),
-        iz = static_cast<int>(z), iw = static_cast<int>(w);
-
-    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(ix) : "r"(iy));
-    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(iz) : "r"(iw));
-    asm volatile("prmt.b32 %0, %0, %1, 0x5410;" : "+r"(ix) : "r"(iz));
-    return ix;
-}
-
-template <typename src_type, typename feed_type>
-struct MaxPoolerBase;
-
-template <typename feed_type>
-struct MaxPoolerBase<int8_t, feed_type> {
-    static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int8_t res[nr_results];
-
-    __device__ MaxPoolerBase(int) {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = -128;
-        }
-    }
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] = res[idx] > ix ? res[idx] : ix;
-        res[idx + 1] = res[idx + 1] > iy ? res[idx + 1] : iy;
-        res[idx + 2] = res[idx + 2] > iz ? res[idx + 2] : iz;
-        res[idx + 3] = res[idx + 3] > iw ? res[idx + 3] : iw;
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MaxPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type) \
-    template <>                         \
-    struct MaxPooler<int8_t, _feed_type> : MaxPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                    \
-    using feed_type = _feed_type;                  \
-    using Base = MaxPoolerBase<int8_t, feed_type>; \
-    using MaxPoolerBase<int8_t, feed_type>::MaxPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret) \
-    { _ret = pack_int8_to_int8x4(_x, _y, _z, _w); }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) { FEED1; }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) { FEED2; }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) { FEED4; }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename src_type, typename feed_type>
-struct MeanIncludeRoundedPoolerBase;
-
-template <typename feed_type>
-struct MeanIncludeRoundedPoolerBase<int8_t, feed_type> {
-    static constexpr int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int32_t res[nr_results];
-    const int count;
-    const float fi_count;
-
-    __device__ MeanIncludeRoundedPoolerBase(int count)
-            : count{count}, fi_count{1.f / count} {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = 0;
-        }
-    }
-
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] += static_cast<int32_t>(ix);
-        res[idx + 1] += static_cast<int32_t>(iy);
-        res[idx + 2] += static_cast<int32_t>(iz);
-        res[idx + 3] += static_cast<int32_t>(iw);
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MeanIncludeRoundedPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type)                 \
-    template <>                                         \
-    struct MeanIncludeRoundedPooler<int8_t, _feed_type> \
-            : MeanIncludeRoundedPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                                   \
-    using feed_type = _feed_type;                                 \
-    using Base = MeanIncludeRoundedPoolerBase<int8_t, feed_type>; \
-    using MeanIncludeRoundedPoolerBase<int8_t, feed_type>::MeanIncludeRoundedPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret)                                          \
-    {                                                                     \
-        float fx = roundf(static_cast<float>(_x) * Base::fi_count);       \
-        float fy = roundf(static_cast<float>(_y) * Base::fi_count);       \
-        float fz = roundf(static_cast<float>(_z) * Base::fi_count);       \
-        float fw = roundf(static_cast<float>(_w) * Base::fi_count);       \
-        _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \
-    }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) { FEED1; }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) { FEED2; }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) { FEED4; }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename src_type, typename feed_type>
-struct MeanExcludeRoundedPoolerBase;
-
-template <typename feed_type>
-struct MeanExcludeRoundedPoolerBase<int8_t, feed_type> {
-    static const int nr_results = sizeof(feed_type) / sizeof(int8_t);
-    int32_t res[nr_results];
-    int count;
-
-    __device__ MeanExcludeRoundedPoolerBase(int /* count */) {}
-    __device__ __forceinline__ void init() {
-#pragma unroll
-        for (int i = 0; i < nr_results; ++i) {
-            res[i] = 0;
-        }
-        count = 0;
-    }
-
-    __device__ __forceinline__ void feed(int32_t x, int idx) {
-        int8_t ix = (x & 0xff);
-        int8_t iy = ((x >> 8) & 0xff);
-        int8_t iz = ((x >> 16) & 0xff);
-        int8_t iw = ((x >> 24) & 0xff);
-        res[idx] += static_cast<int32_t>(ix);
-        res[idx + 1] += static_cast<int32_t>(iy);
-        res[idx + 2] += static_cast<int32_t>(iz);
-        res[idx + 3] += static_cast<int32_t>(iw);
-    }
-};
-
-template <typename src_type, typename feed_type>
-struct MeanExcludeRoundedPooler;
-
-#define SPEC_WITH_FEED_TYPE(_feed_type)                 \
-    template <>                                         \
-    struct MeanExcludeRoundedPooler<int8_t, _feed_type> \
-            : MeanExcludeRoundedPoolerBase<int8_t, _feed_type>
-
-#define COMMON_DEFS(_feed_type)                                   \
-    using feed_type = _feed_type;                                 \
-    using Base = MeanExcludeRoundedPoolerBase<int8_t, feed_type>; \
-    using MeanExcludeRoundedPoolerBase<int8_t, feed_type>::MeanExcludeRoundedPoolerBase;
-
-#define cb(_x, _y, _z, _w, _ret)                                          \
-    {                                                                     \
-        float fx = roundf(static_cast<float>(_x) / Base::count);          \
-        float fy = roundf(static_cast<float>(_y) / Base::count);          \
-        float fz = roundf(static_cast<float>(_z) / Base::count);          \
-        float fw = roundf(static_cast<float>(_w) / Base::count);          \
-        _ret = transform_float4_to_int8x4(::make_float4(fx, fy, fz, fw)); \
-    }
-
-SPEC_WITH_FEED_TYPE(int32_t) {
-    COMMON_DEFS(int32_t);
-    __device__ __forceinline__ void feed(int32_t x) {
-        FEED1;
-        count++;
-    }
-
-    __device__ __forceinline__ int get_ans() {
-        int i1;
-        ANS1(cb);
-        return i1;
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int2) {
-    COMMON_DEFS(int2);
-    __device__ __forceinline__ void feed(int2 x) {
-        FEED2;
-        count++;
-    }
-    __device__ __forceinline__ int2 get_ans() {
-        int i1, i2;
-        ANS2(cb);
-        return ::make_int2(i1, i2);
-    }
-};
-
-SPEC_WITH_FEED_TYPE(int4) {
-    COMMON_DEFS(int4);
-    __device__ __forceinline__ void feed(int4 x) {
-        FEED4;
-        count++;
-    }
-
-    __device__ __forceinline__ int4 get_ans() {
-        int i1, i2, i3, i4;
-        ANS4(cb);
-        return ::make_int4(i1, i2, i3, i4);
-    }
-};
-
-#undef cb
-#undef COMMON_DEFS
-#undef SPEC_WITH_FEED_TYPE
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_cdiv4hwn4(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tidx = threadIdx.x;
-    const int tidy = threadIdx.y;
-    const int bidx = blockIdx.x;
-    const int bidy = blockIdx.y;
-    const int bidz = blockIdx.z;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 4;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    const int batch = (bidy * blockDim.x + tidx) * ldg_width;
-    const int packed_ch = bidz * blockDim.y + tidy;
-    const int npack = param.n * pack_size;
-    if (batch >= param.n || packed_ch >= param.c / pack_size)
-        return;
-
-    const int ho = bidx / param.wo;
-    const int wo = bidx - param.wo * ho;
-    const int input_pixels = param.hi * param.wi;
-    const int output_pixels = param.ho * param.wo;
-    const int8_t* __restrict__ g_src_ptr =
-            src + batch * pack_size + packed_ch * input_pixels * npack;
-    int8_t* __restrict__ g_dst_ptr = dst + batch * pack_size +
-                                     packed_ch * output_pixels * npack +
-                                     (ho * param.wo + wo) * npack;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = ho * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = wo * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * npack;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_ncdiv4hw4(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 4;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    MEGDNN_STATIC_ASSERT(
-            ldg_width == 1,
-            "pooling2d (NCHW4) kernel must use 32bit width ldg instruction");
-    const int wo_ldg = param.wo / ldg_width;
-    const int c_packed = param.c / pack_size;
-    const int batch = tid / (param.ho * wo_ldg * c_packed);
-    const int chw = tid - batch * param.ho * wo_ldg * c_packed;
-    const int oc_packed = chw / (param.ho * wo_ldg);
-    const int hw = chw - oc_packed * param.ho * wo_ldg;
-    const int oh = hw / wo_ldg;
-    const int ow = (hw - wo_ldg * oh) * ldg_width;
-    if (batch >= param.n || oc_packed >= c_packed || oh >= param.ho ||
-        ow >= param.wo)
-        return;
-
-    const int in_batch_stride = param.hi * param.wi * param.c;
-    const int out_batch_stride = param.ho * param.wo * param.c;
-    const int in_channel_stride = param.hi * param.wi * pack_size;
-    const int out_channel_stride = param.ho * param.wo * pack_size;
-    const int8_t* __restrict__ g_src_ptr =
-            src + batch * in_batch_stride + oc_packed * in_channel_stride;
-    int8_t* __restrict__ g_dst_ptr = dst + batch * out_batch_stride +
-                                     oc_packed * out_channel_stride +
-                                     (oh * param.wo + ow) * pack_size;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = oh * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = ow * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * pack_size;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-template <typename Pooler>
-__global__ void pooling2d_device_template_int8_ncdiv32hw32(
-        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
-    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
-
-    using ldg_type = typename Pooler::feed_type;
-    static int constexpr pack_size = 32;
-    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
-    static int constexpr ldg_width_bytes = sizeof(ldg_type);
-    static int constexpr section = pack_size / sizeof(ldg_type);
-    MEGDNN_STATIC_ASSERT(
-            ldg_width == 4,
-            "pooling2d (NCHW32) kernel must use 128bit width ldg instruction");
-    const int c_packed = param.c / pack_size;
-    const int batch = tid / (param.ho * param.wo * c_packed * section);
-    const int batch_residual =
-            tid - batch * param.ho * param.wo * c_packed * section;
-    const int oc = batch_residual / (param.ho * param.wo * section);
-    const int oc_residual = batch_residual - oc * param.ho * param.wo * section;
-    const int oh = oc_residual / (param.wo * section);
-    const int oh_residual = (oc_residual - oh * param.wo * section);
-    const int ow = oh_residual / section;
-    const int sec = oh_residual - ow * section;
-    if (batch >= param.n || oc >= c_packed || oh >= param.ho || ow >= param.wo)
-        return;
-
-    const int in_batch_stride = param.hi * param.wi * param.c;
-    const int out_batch_stride = param.ho * param.wo * param.c;
-    const int in_channel_stride = param.hi * param.wi * pack_size;
-    const int out_channel_stride = param.ho * param.wo * pack_size;
-    const int8_t* __restrict__ g_src_ptr = src + batch * in_batch_stride +
-                                           oc * in_channel_stride +
-                                           sec * ldg_width_bytes;
-    int8_t* __restrict__ g_dst_ptr =
-            dst + batch * out_batch_stride + oc * out_channel_stride +
-            (oh * param.wo + ow) * pack_size + sec * ldg_width_bytes;
-
-    Pooler pooler(param.window_h * param.window_w);
-    pooler.init();
-    for (int fh = 0; fh < param.window_h; fh++) {
-        uint32_t ih = oh * param.sh + fh - param.ph;
-        for (int fw = 0; fw < param.window_w; fw++) {
-            uint32_t iw = ow * param.sw + fw - param.pw;
-            if (ih < param.hi && iw < param.wi) {
-                const int8_t* __restrict__ cur_src_ptr =
-                        g_src_ptr + (ih * param.wi + iw) * pack_size;
-                ldg_type sval =
-                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
-                pooler.feed(sval);
-            }
-        }
-    }
-    ldg_type res = pooler.get_ans();
-    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
-}
-
-};  // namespace
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src,
-                                                          int8_t* d_dst,
-                                                          const Param& param,
-                                                          cudaStream_t stream,
-                                                          uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads_x = 0, vthreads_y = param.c / 4;
-#define dispatch_pooling_mode(_feed_type)                          \
-    switch (mode) {                                                \
-        case Mode::MAX:                                            \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MaxPooler<int8_t, _feed_type>>;                \
-            break;                                                 \
-        case Mode::AVERAGE:                                        \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MeanIncludeRoundedPooler<int8_t, _feed_type>>; \
-            break;                                                 \
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:                  \
-            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
-                    MeanExcludeRoundedPooler<int8_t, _feed_type>>; \
-            break;                                                 \
-        default:                                                   \
-            megdnn_assert(false, "invalid pooling mode");          \
-    }
-    if (param.n % 4 == 0) {
-        dispatch_pooling_mode(int4);
-        vthreads_x = param.n / 4;
-    } else if (param.n % 2 == 0) {
-        dispatch_pooling_mode(int2);
-        vthreads_x = param.n / 2;
-    } else {
-        dispatch_pooling_mode(int32_t);
-        vthreads_x = param.n;
-    }
-#undef dispatch_pooling_mode
-    constexpr uint32_t threads_x = 16;
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    uint32_t nr_threads_x = std::min(threads_x, vthreads_x),
-             nr_threads_y = std::min(nr_threads / nr_threads_x, vthreads_y);
-    uint32_t nr_blocks_x = param.ho * param.wo,
-             nr_blocks_y = DIVUP(vthreads_x, nr_threads_x),
-             nr_blocks_z = DIVUP(vthreads_y, nr_threads_y);
-    dim3 threads{nr_threads_x, nr_threads_y, 1};
-    dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z};
-    kern<<<blocks, threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src,
-                                                          int8_t* d_dst,
-                                                          const Param& param,
-                                                          cudaStream_t stream,
-                                                          uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads = param.n * param.c * param.ho * param.wo / 4;
-    switch (mode) {
-        case Mode::MAX:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MaxPooler<int8_t, int32_t>>;
-            break;
-        case Mode::AVERAGE:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MeanIncludeRoundedPooler<int8_t, int32_t>>;
-            break;
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
-            kern = pooling2d_device_template_int8_ncdiv4hw4<
-                    MeanExcludeRoundedPooler<int8_t, int32_t>>;
-            break;
-        default:
-            megdnn_assert(false, "invalid pooling mode");
-    }
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    nr_threads = std::min(nr_threads, vthreads);
-    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
-    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src,
-                                                            int8_t* d_dst,
-                                                            const Param& param,
-                                                            cudaStream_t stream,
-                                                            uint32_t mode) {
-    using Mode = megdnn::param_enumv::Pooling::Mode;
-    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
-    uint32_t vthreads = param.n * param.c * param.ho * param.wo / 16;
-    switch (mode) {
-        case Mode::MAX:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MaxPooler<int8_t, int4>>;
-            break;
-        case Mode::AVERAGE:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MeanIncludeRoundedPooler<int8_t, int4>>;
-            break;
-        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
-            kern = pooling2d_device_template_int8_ncdiv32hw32<
-                    MeanExcludeRoundedPooler<int8_t, int4>>;
-            break;
-        default:
-            megdnn_assert(false, "invalid pooling mode");
-    }
-    uint32_t nr_threads = query_blocksize_for_kernel(kern);
-    nr_threads = std::min(nr_threads, vthreads);
-    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
-    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
-    after_kernel_launch();
-}
-
-#undef FEED1
-#undef FEED2
-#undef FEED3
-#undef ANS1
-#undef ANS2
-#undef ANS4
-
-// vim: syntax=cuda.doxygen
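The deleted file reappears below as pooling2d_qint.cu: the macro-generated, int8-only pooler hierarchy is folded into single pooler templates parameterized over the source type via a TypeTrait, so the same kernels serve both int8 and int4. The byte packing the new code relies on puts lane 0 in the low nibble and lane 1 in the high nibble; a host-side sketch of the same convention as pack_int8_to_int4x2 (hypothetical standalone names):

    #include <cstdint>
    #include <cstdio>

    // Two signed int4 lanes per byte: x0 -> low nibble, x1 -> high nibble.
    static uint8_t pack_int4x2(int8_t x0, int8_t x1) {
        return uint8_t((x0 & 0xf) | (x1 << 4));
    }

    int main() {
        printf("0x%02x\n", pack_int4x2(-2, 3));  // -2 = 0xe, 3 = 0x3 -> 0x3e
    }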
diff --git a/dnn/src/cuda/pooling/pooling2d_qint.cu b/dnn/src/cuda/pooling/pooling2d_qint.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5a648951fb0857c22ce499797e7ef7d034e432b2
--- /dev/null
+++ b/dnn/src/cuda/pooling/pooling2d_qint.cu
@@ -0,0 +1,540 @@
+/**
+ * \file dnn/src/cuda/pooling/pooling2d_qint.cu
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "./pooling2d_qint.cuh"
+#include "src/common/opr_param_defs_enumv.cuh"
+#include "src/cuda/query_blocksize.cuh"
+
+using namespace megdnn;
+using namespace cuda;
+using namespace pooling2d;
+
+namespace {
+__device__ __forceinline__ int pack_int8_to_int8x4(int8_t x, int8_t y, int8_t z,
+                                                   int8_t w) {
+    int ix = static_cast<int>(x), iy = static_cast<int>(y),
+        iz = static_cast<int>(z), iw = static_cast<int>(w);
+
+    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(ix) : "r"(iy));
+    asm volatile("prmt.b32 %0, %0, %1, 0x1140;" : "+r"(iz) : "r"(iw));
+    asm volatile("prmt.b32 %0, %0, %1, 0x5410;" : "+r"(ix) : "r"(iz));
+    return ix;
+}
+
+template <int regs, typename dt, typename OutDtype>
+__device__ __forceinline__ OutDtype pack_int8(int8_t (&x)[regs]);
+
+template <>
+__device__ __forceinline__ int pack_int8<4, int8_t, int>(int8_t (&x)[4]) {
+    return pack_int8_to_int8x4(x[0], x[1], x[2], x[3]);
+}
+
+template <>
+__device__ __forceinline__ int2 pack_int8<8, int8_t, int2>(int8_t (&x)[8]) {
+    int8_t x0[4]{x[0], x[1], x[2], x[3]};
+    int8_t x1[4]{x[4], x[5], x[6], x[7]};
+    return ::make_int2(pack_int8<4, int8_t, int>(x0),
+                       pack_int8<4, int8_t, int>(x1));
+}
+
+template <>
+__device__ __forceinline__ int4 pack_int8<16, int8_t, int4>(int8_t (&x)[16]) {
+    int8_t x0[4]{x[0], x[1], x[2], x[3]};
+    int8_t x1[4]{x[4], x[5], x[6], x[7]};
+    int8_t x2[4]{x[8], x[9], x[10], x[11]};
+    int8_t x3[4]{x[12], x[13], x[14], x[15]};
+    return ::make_int4(
+            pack_int8<4, int8_t, int>(x0), pack_int8<4, int8_t, int>(x1),
+            pack_int8<4, int8_t, int>(x2), pack_int8<4, int8_t, int>(x3));
+}
+
+__device__ __forceinline__ int8_t pack_int8_to_int4x2(int8_t x0, int8_t x1) {
+    return (x0 & 0xf) | (x1 << 4);
+}
+template <>
+__device__ __forceinline__ int pack_int8<8, dt_qint4, int>(int8_t (&x)[8]) {
+    int8_t x0 = pack_int8_to_int4x2(x[0], x[1]);
+    int8_t x1 = pack_int8_to_int4x2(x[2], x[3]);
+    int8_t x2 = pack_int8_to_int4x2(x[4], x[5]);
+    int8_t x3 = pack_int8_to_int4x2(x[6], x[7]);
+    return pack_int8_to_int8x4(x0, x1, x2, x3);
+}
+template <>
+__device__ __forceinline__ int4 pack_int8<32, dt_qint4, int4>(int8_t (&x)[32]) {
+    int8_t x0[8]{x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7]};
+    int8_t x1[8]{x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]};
+    int8_t x2[8]{x[16], x[17], x[18], x[19], x[20], x[21], x[22], x[23]};
+    int8_t x3[8]{x[24], x[25], x[26], x[27], x[28], x[29], x[30], x[31]};
+    return ::make_int4(
+            pack_int8<8, dt_qint4, int>(x0), pack_int8<8, dt_qint4, int>(x1),
+            pack_int8<8, dt_qint4, int>(x2), pack_int8<8, dt_qint4, int>(x3));
+}
+
+template <typename dt>
+struct TypeTrait;
+
+template <>
+struct TypeTrait<int8_t> {
+    static constexpr int bit_width = 8;
+    static constexpr int mask = 0xff;
+    static constexpr int8_t min = -128;
+    static constexpr int elem_per_32bit = 32 / bit_width;
+    static constexpr int shift_fix_sign = 0;
+};
+
+template <>
+struct TypeTrait<dt_qint4> {
+    static constexpr int bit_width = 4;
+    static constexpr int mask = 0xf;
+    static constexpr int8_t min = -8;
+    static constexpr int elem_per_32bit = 32 / bit_width;
+    static constexpr int shift_fix_sign = 4;
+};
+
+template <typename src_type, typename _feed_type>
+struct MaxPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+    int8_t res[nr_results];
+
+    __device__ MaxPooler(int) {}
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = TypeTrait<src_type>::min;
+        }
+    }
+    __device__ __forceinline__ void feed(int x, int idx = 0) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] = res[idx + i] > temp ? res[idx + i] : temp;
+        }
+    }
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        ans = pack_int8<nr_results, src_type, feed_type>(res);
+        return ans;
+    }
+};
+
+template <typename src_type, typename _feed_type>
+struct MeanIncludeRoundedPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+
+    int32_t res[nr_results];
+    const int count;
+    const float fi_count;
+
+    __device__ MeanIncludeRoundedPooler(int count)
+            : count{count}, fi_count{1.f / count} {}
+
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = 0;
+        }
+    }
+    __device__ __forceinline__ void feed(int x, int idx = 0) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] += static_cast<int32_t>(temp);
+        }
+    }
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        int8_t out_res[nr_results];
+#pragma unroll
+        for (int i = 0; i < nr_results; i++) {
+            float f32_res = roundf(static_cast<float>(res[i]) * fi_count);
+            int i8_res;
+            asm volatile("cvt.rni.s8.f32 %0, %1;"
+                         : "=r"(i8_res)
+                         : "f"(f32_res));
+            out_res[i] = i8_res;
+        }
+        ans = pack_int8<nr_results, src_type, feed_type>(out_res);
+        return ans;
+    }
+};
+
+template <typename src_type, typename _feed_type>
+struct MeanExcludeRoundedPooler {
+    using feed_type = _feed_type;
+    static constexpr int bit_width = TypeTrait<src_type>::bit_width;
+    static constexpr int nr_results = sizeof(feed_type) * 8 / bit_width;
+    static constexpr int elem_per_32bit = TypeTrait<src_type>::elem_per_32bit;
+    static constexpr int shift_fix_sign = TypeTrait<src_type>::shift_fix_sign;
+    int32_t res[nr_results];
+    int count;
+    __device__ MeanExcludeRoundedPooler(int) {}
+
+    __device__ __forceinline__ void init() {
+#pragma unroll
+        for (int i = 0; i < nr_results; ++i) {
+            res[i] = 0;
+        }
+        count = 0;
+    }
+    __device__ __forceinline__ void feed(int x, int idx) {
+        constexpr int unroll_n = sizeof(int) * 8 / bit_width;
+#pragma unroll
+        for (int i = 0; i < unroll_n; i++) {
+            int8_t temp = ((x >> (i * bit_width)) & TypeTrait<src_type>::mask)
+                          << shift_fix_sign;
+            temp = temp >> shift_fix_sign;
+            res[idx + i] += static_cast<int32_t>(temp);
+        }
+    }
+    __device__ __forceinline__ void feed(int x) {
+        feed(x, 0);
+        count++;
+    }
+
+    __device__ __forceinline__ void feed(int2 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        count++;
+    }
+    __device__ __forceinline__ void feed(int4 x) {
+        feed(x.x, 0 * elem_per_32bit);
+        feed(x.y, 1 * elem_per_32bit);
+        feed(x.z, 2 * elem_per_32bit);
+        feed(x.w, 3 * elem_per_32bit);
+        count++;
+    }
+    __device__ __forceinline__ feed_type get_ans() {
+        feed_type ans;
+        int8_t out_res[nr_results];
+#pragma unroll
+        for (int i = 0; i < nr_results; i++) {
+            float f32_res = roundf(static_cast<float>(res[i]) / count);
+            int i8_res;
+            asm volatile("cvt.rni.s8.f32 %0, %1;"
+                         : "=r"(i8_res)
+                         : "f"(f32_res));
+            out_res[i] = i8_res;
+        }
+        ans = pack_int8<nr_results, src_type, feed_type>(out_res);
+        return ans;
+    }
+};
+
+template <typename Pooler>
+__global__ void pooling2d_device_template_int8_cdiv4hwn4(
+        const int8_t* __restrict__ src, int8_t* __restrict__ dst, Param param) {
+    const int tidx = threadIdx.x;
+    const int tidy = threadIdx.y;
+    const int bidx = blockIdx.x;
+    const int bidy = blockIdx.y;
+    const int bidz = blockIdx.z;
+
+    using ldg_type = typename Pooler::feed_type;
+    static int constexpr pack_size = 4;
+    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
+    const int batch = (bidy * blockDim.x + tidx) * ldg_width;
+    const int packed_ch = bidz * blockDim.y + tidy;
+    const int npack = param.n * pack_size;
+    if (batch >= param.n || packed_ch >= param.c / pack_size)
+        return;
+
+    const int ho = bidx / param.wo;
+    const int wo = bidx - param.wo * ho;
+    const int input_pixels = param.hi * param.wi;
+    const int output_pixels = param.ho * param.wo;
+    const int8_t* __restrict__ g_src_ptr =
+            src + batch * pack_size + packed_ch * input_pixels * npack;
+    int8_t* __restrict__ g_dst_ptr = dst + batch * pack_size +
+                                     packed_ch * output_pixels * npack +
+                                     (ho * param.wo + wo) * npack;
+
+    Pooler pooler(param.window_h * param.window_w);
+    pooler.init();
+    for (int fh = 0; fh < param.window_h; fh++) {
+        uint32_t ih = ho * param.sh + fh - param.ph;
+        for (int fw = 0; fw < param.window_w; fw++) {
+            uint32_t iw = wo * param.sw + fw - param.pw;
+            if (ih < param.hi && iw < param.wi) {
+                const int8_t* __restrict__ cur_src_ptr =
+                        g_src_ptr + (ih * param.wi + iw) * npack;
+                ldg_type sval =
+                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
+                pooler.feed(sval);
+            }
+        }
+    }
+    ldg_type res = pooler.get_ans();
+    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
+}
+
+template <typename Pooler, int pack_size, int pack_byte,
+          int ldg_width_assert = 4>
+__global__ void pooling2d_device_template_nchwc(const int8_t* __restrict__ src,
+                                                int8_t* __restrict__ dst,
+                                                Param param) {
+    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    using ldg_type = typename Pooler::feed_type;
+    static int constexpr ldg_width = sizeof(ldg_type) / sizeof(int32_t);
+    static int constexpr ldg_width_bytes = sizeof(ldg_type);
+    static int constexpr section = pack_byte / sizeof(ldg_type);
+    MEGDNN_STATIC_ASSERT(
+            ldg_width == ldg_width_assert,
+            "pooling2d (NCHW64) kernel must use 128bit width ldg instruction");
+    const int c_packed = param.c / pack_size;
+    const int batch = tid / (param.ho * param.wo * c_packed * section);
+    const int batch_residual =
+            tid - batch * param.ho * param.wo * c_packed * section;
+    const int oc = batch_residual / (param.ho * param.wo * section);
+    const int oc_residual = batch_residual - oc * param.ho * param.wo * section;
+    const int oh = oc_residual / (param.wo * section);
+    const int oh_residual = (oc_residual - oh * param.wo * section);
+    const int ow = oh_residual / section;
+    const int sec = oh_residual - ow * section;
+    if (batch >= param.n || oc >= c_packed || oh >= param.ho || ow >= param.wo)
+        return;
+
+    const int in_batch_stride =
+            param.hi * param.wi * param.c * pack_byte / pack_size;
+    const int out_batch_stride =
+            param.ho * param.wo * param.c * pack_byte / pack_size;
+    const int in_channel_stride = param.hi * param.wi * pack_byte;
+    const int out_channel_stride = param.ho * param.wo * pack_byte;
+    const int8_t* __restrict__ g_src_ptr =
+            src + (batch * in_batch_stride + oc * in_channel_stride +
+                   sec * ldg_width_bytes);
+    int8_t* __restrict__ g_dst_ptr =
+            dst + (batch * out_batch_stride + oc * out_channel_stride +
+                   (oh * param.wo + ow) * pack_byte + sec * ldg_width_bytes);
+
+    Pooler pooler(param.window_h * param.window_w);
+    pooler.init();
+    for (int fh = 0; fh < param.window_h; fh++) {
+        uint32_t ih = oh * param.sh + fh - param.ph;
+        for (int fw = 0; fw < param.window_w; fw++) {
+            uint32_t iw = ow * param.sw + fw - param.pw;
+            if (ih < param.hi && iw < param.wi) {
+                const int8_t* __restrict__ cur_src_ptr =
+                        g_src_ptr + (ih * param.wi + iw) * pack_byte;
+                ldg_type sval =
+                        __ldg(reinterpret_cast<const ldg_type*>(cur_src_ptr));
+                pooler.feed(sval);
+            }
+        }
+    }
+    ldg_type res = pooler.get_ans();
+    *(reinterpret_cast<ldg_type*>(g_dst_ptr)) = res;
+}
+
+};  // namespace
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_cdiv4hwn4(const int8_t* d_src,
+                                                          int8_t* d_dst,
+                                                          const Param& param,
+                                                          cudaStream_t stream,
+                                                          uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    uint32_t vthreads_x = 0, vthreads_y = param.c / 4;
+#define dispatch_pooling_mode(_feed_type)                          \
+    switch (mode) {                                                \
+        case Mode::MAX:                                            \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MaxPooler<int8_t, _feed_type>>;                \
+            break;                                                 \
+        case Mode::AVERAGE:                                        \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MeanIncludeRoundedPooler<int8_t, _feed_type>>; \
+            break;                                                 \
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:                  \
+            kern = pooling2d_device_template_int8_cdiv4hwn4<       \
+                    MeanExcludeRoundedPooler<int8_t, _feed_type>>; \
+            break;                                                 \
+        default:                                                   \
+            megdnn_assert(false, "invalid pooling mode");          \
+    }
+    if (param.n % 4 == 0) {
+        dispatch_pooling_mode(int4);
+        vthreads_x = param.n / 4;
+    } else if (param.n % 2 == 0) {
+        dispatch_pooling_mode(int2);
+        vthreads_x = param.n / 2;
+    } else {
+        dispatch_pooling_mode(int32_t);
+        vthreads_x = param.n;
+    }
+#undef dispatch_pooling_mode
+    constexpr uint32_t threads_x = 16;
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    uint32_t nr_threads_x = std::min(threads_x, vthreads_x),
+             nr_threads_y = std::min(nr_threads / nr_threads_x, vthreads_y);
+    uint32_t nr_blocks_x = param.ho * param.wo,
+             nr_blocks_y = DIVUP(vthreads_x, nr_threads_x),
+             nr_blocks_z = DIVUP(vthreads_y, nr_threads_y);
+    dim3 threads{nr_threads_x, nr_threads_y, 1};
+    dim3 blocks{nr_blocks_x, nr_blocks_y, nr_blocks_z};
+    kern<<<blocks, threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src,
+                                                          int8_t* d_dst,
+                                                          const Param& param,
+                                                          cudaStream_t stream,
+                                                          uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 4;
+    constexpr int elem_per_byte = 1;
+    constexpr int pack_size = 4;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    constexpr int ldg_assert_width = ldg_byte / sizeof(int32_t);
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<int8_t, int32_t>,
+                                                   pack_size, pack_byte,
+                                                   ldg_assert_width>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<int8_t, int32_t>,
+                    pack_size, pack_byte, ldg_assert_width>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<int8_t, int32_t>,
+                    pack_size, pack_byte, ldg_assert_width>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src,
+                                                            int8_t* d_dst,
+                                                            const Param& param,
+                                                            cudaStream_t stream,
+                                                            uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 16;
+    constexpr int elem_per_byte = 1;
+    constexpr int pack_size = 32;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<int8_t, int4>,
+                                                   pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<int8_t, int4>, pack_size,
+                    pack_byte>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<int8_t, int4>, pack_size,
+                    pack_byte>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+void megdnn::cuda::pooling2d::do_pooling2d_int4_ncdiv64hw64(const int8_t* d_src,
+                                                            int8_t* d_dst,
+                                                            const Param& param,
+                                                            cudaStream_t stream,
+                                                            uint32_t mode) {
+    using Mode = megdnn::param_enumv::Pooling::Mode;
+    void (*kern)(const int8_t* __restrict__, int8_t* __restrict__, Param param);
+    constexpr int ldg_byte = 16;
+    constexpr int elem_per_byte = 2;
+    constexpr int pack_size = 64;
+    constexpr int pack_byte = pack_size / elem_per_byte;
+    constexpr int elem_per_thread = ldg_byte * elem_per_byte;
+    uint32_t vthreads =
+            param.n * param.c * param.ho * param.wo / elem_per_thread;
+    switch (mode) {
+        case Mode::MAX:
+            kern = pooling2d_device_template_nchwc<MaxPooler<dt_qint4, int4>,
+                                                   pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE:
+            kern = pooling2d_device_template_nchwc<
+                    MeanIncludeRoundedPooler<dt_qint4, int4>,
+                    pack_size, pack_byte>;
+            break;
+        case Mode::AVERAGE_COUNT_EXCLUDE_PADDING:
+            kern = pooling2d_device_template_nchwc<
+                    MeanExcludeRoundedPooler<dt_qint4, int4>,
+                    pack_size, pack_byte>;
+            break;
+        default:
+            megdnn_assert(false, "invalid pooling mode");
+    }
+    uint32_t nr_threads = query_blocksize_for_kernel(kern);
+    nr_threads = std::min(nr_threads, vthreads);
+    uint32_t nr_blocks = DIVUP(vthreads, nr_threads);
+    kern<<<nr_blocks, nr_threads, 0, stream>>>(d_src, d_dst, param);
+    after_kernel_launch();
+}
+
+// vim: syntax=cuda.doxygen
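A detail worth calling out in the new poolers above: feed() extracts each lane with a mask and then sign-extends it by shifting left and back right by shift_fix_sign (4 for int4, 0 for int8). A host-side illustration of the same trick, assuming an arithmetic right shift on int8_t as on the CUDA targets:

    #include <cstdint>
    #include <cstdio>

    int main() {
        int x = int(0x9ABCDEF0u);  // eight packed int4 lanes
        for (int i = 0; i < 8; ++i) {
            int8_t t = int8_t(((x >> (i * 4)) & 0xf) << 4);  // shift_fix_sign = 4
            t = t >> 4;  // arithmetic shift restores the sign
            printf("%d ", t);  // prints: 0 -1 -2 -3 -4 -5 -6 -7
        }
        printf("\n");
    }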
diff --git a/dnn/src/cuda/pooling/pooling2d_int8.cuh b/dnn/src/cuda/pooling/pooling2d_qint.cuh
similarity index 83%
rename from dnn/src/cuda/pooling/pooling2d_int8.cuh
rename to dnn/src/cuda/pooling/pooling2d_qint.cuh
index b5554ba8f3f10bc6aa8d6766ca6094890e89c7ec..ed24e526e2c817193cdac320e0d0a1dad10b1f7a 100644
--- a/dnn/src/cuda/pooling/pooling2d_int8.cuh
+++ b/dnn/src/cuda/pooling/pooling2d_qint.cuh
@@ -1,5 +1,5 @@
 /**
- * \file dnn/src/cuda/pooling/pooling2d_int8.cuh
+ * \file dnn/src/cuda/pooling/pooling2d_qint.cuh
  * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
@@ -32,6 +32,11 @@ void do_pooling2d_int8_ncdiv4hw4(const int8_t* d_src, int8_t* d_dst,
 void do_pooling2d_int8_ncdiv32hw32(const int8_t* d_src, int8_t* d_dst,
                                    const Param& param, cudaStream_t stream,
                                    uint32_t mode);
+
+void do_pooling2d_int4_ncdiv64hw64(const int8_t* d_src, int8_t* d_dst,
+                                   const Param& param, cudaStream_t stream,
+                                   uint32_t mode);
+
 }  // namespace pooling2d
 }  // namespace cuda
 }  // namespace megdnn
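For the new do_pooling2d_int4_ncdiv64hw64 entry point declared above, each thread issues one 16-byte load and, at 2 elements per byte, reduces 32 int4 values, so the launch sizes itself accordingly. The arithmetic, using the constants from the .cu file:

    // ldg_byte = 16, elem_per_byte = 2  =>  elem_per_thread = 32
    // vthreads = param.n * param.c * param.ho * param.wo / 32
    // e.g. n = 64, c = 512, ho = wo = 14: 64 * 512 * 14 * 14 / 32 = 200704 threads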
diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp
index ed978826507c32b1270f5c77c7991499ed9a338b..902780c41bd627c690a72ade5635574a51e234d5 100644
--- a/dnn/src/naive/pooling/opr_impl.cpp
+++ b/dnn/src/naive/pooling/opr_impl.cpp
@@ -15,6 +15,7 @@
 #include "megdnn/dtype.h"
 #include "src/common/utils.h"
 #include "src/naive/handle.h"
+#include "src/naive/lowbit_utils.h"
 
 #include "midout.h"
 MIDOUT_DECL(megdnn_naive_pooling)
@@ -190,6 +191,12 @@ struct NCHW32IdxGetter {
         return (((n * (C >> 5) + (c >> 5)) * H + h) * W + w) * 32 + (c & 0x1f);
     }
 };
+struct NCHW64IdxGetter {
+    static size_t get_idx(size_t n, size_t c, size_t h, size_t w, size_t,
+                          size_t C, size_t H, size_t W) {
+        return (((n * (C >> 6) + (c >> 6)) * H + h) * W + w) * 64 + (c & 0x3f);
+    }
+};
 /*!
  * Pooler for AVERAGE_COUNT_EXCLUDE_PADDING mode
  */
@@ -375,15 +382,81 @@ void pooling_backward_max_impl(const ctype* __restrict src,
 
 namespace megdnn {
 namespace naive {
 
+WorkspaceBundle PoolingForwardImpl::get_workspace_bundle(
+        void* ptr, const TensorLayout& src, const TensorLayout& dst) const {
+    SmallVector<size_t> sizes;
+    TensorLayout fsrc = src;
+    TensorLayout fdst = dst;
+    auto get_workspace = [&sizes](TensorLayout& layout) {
+        if (layout.dtype.enumv() == DTypeEnum::Quantized4Asymm ||
+            layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+            layout.dtype = dtype::Int8();
+            layout.format = TensorLayout::Format(layout.dtype);
+            sizes.push_back(layout.span().dist_byte());
+        }
+    };
+    get_workspace(fsrc);
+    get_workspace(fdst);
+    return {ptr, std::move(sizes)};
+};
+
+size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src,
+                                                  const TensorLayout& dst) {
+    return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes();
+}
+
+namespace {
+
+void post_process(const TensorND& dst, TensorND& comp_dst, Handle* handle,
+                  WorkspaceBundle& workspace_bundle) {
+    if (dst.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+        int8_to_int4(comp_dst, dst);
+    } else if (dst.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        uint8_to_uint4(comp_dst, dst);
+    }
+}
+
+}  // namespace
+
 void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
                               _megdnn_workspace workspace) {
     check_exec(src.layout, dst.layout, workspace.size);
+    TensorND comp_src = src;
+    TensorND comp_dst = dst;
+
+    auto wsb = get_workspace_bundle(workspace.raw_ptr, src.layout, dst.layout);
+    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4) {
+        float scale = src.layout.dtype.param<dtype::QuantizedS4>().scale;
+        comp_src.layout.dtype = dtype::QuantizedS8(scale);
+        comp_src.layout.init_contiguous_stride();
+        comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.raw_ptr = wsb.get(0);
+        comp_dst.layout.dtype = dtype::QuantizedS8(scale);
+        comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
+        comp_dst.layout.init_contiguous_stride();
+        comp_dst.raw_ptr = wsb.get(1);
+        int4_to_int8(src, comp_src);
+    } else if (src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        float scale = src.layout.dtype.param<dtype::Quantized4Asymm>().scale;
+        uint8_t zero_point =
+                src.layout.dtype.param<dtype::Quantized4Asymm>().zero_point;
+        comp_src.layout.dtype = dtype::Quantized8Asymm(scale, zero_point);
+        comp_src.layout.format = TensorLayout::Format(comp_src.layout.dtype);
+        comp_src.layout.init_contiguous_stride();
+        comp_src.raw_ptr = wsb.get(0);
+        comp_dst.layout.dtype = dtype::Quantized8Asymm(scale, zero_point);
+        comp_dst.layout.format = TensorLayout::Format(comp_dst.layout.dtype);
+        comp_dst.layout.init_contiguous_stride();
+        comp_dst.raw_ptr = wsb.get(1);
+        uint4_to_uint8(src, comp_src);
+    }
+
     size_t c_pos, spatial_pos, batch_pos = 0;
     if (param().format == Param::Format::NCHW ||
         param().format == Param::Format::NCHW4 ||
         param().format == Param::Format::NCHW88 ||
         param().format == Param::Format::NCHW44 ||
-        param().format == Param::Format::NCHW32) {
+        param().format == Param::Format::NCHW32 ||
+        param().format == Param::Format::NCHW64) {
         c_pos = 1;
         spatial_pos = 2;
     } else if (param().format == Param::Format::NHWC) {
@@ -398,27 +471,35 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         c_pos = 2;
         spatial_pos = 1;
     }
-    size_t N = src.layout.shape[batch_pos], C = src.layout.shape[c_pos],
-           IH = src.layout.shape[spatial_pos + 0],
-           IW = src.layout.shape[spatial_pos + 1];
-    size_t OH = dst.layout.shape[spatial_pos + 0],
-           OW = dst.layout.shape[spatial_pos + 1];
-    if (param().format == Param::Format::NHWCD4) {
-        C *= 4;
-        IW = src.layout.shape[spatial_pos + 2];
-        OW = dst.layout.shape[spatial_pos + 2];
-    }
-    if (param().format == Param::Format::NCHW4 ||
-        param().format == Param::Format::NCHW44 ||
-        param().format == Param::Format::CHWN4) {
-        C *= 4;
-    }
-    if (param().format == Param::Format::NCHW88) {
-        C *= 8;
-    }
-    if (param().format == Param::Format::NCHW32) {
-        C *= 32;
+    size_t N = comp_src.layout.shape[batch_pos],
+           C = comp_src.layout.shape[c_pos],
+           IH = comp_src.layout.shape[spatial_pos + 0],
+           IW = comp_src.layout.shape[spatial_pos + 1];
+    size_t OH = comp_dst.layout.shape[spatial_pos + 0],
+           OW = comp_dst.layout.shape[spatial_pos + 1];
+    switch (param().format) {
+        case Param::Format::NHWCD4:
+            C *= 4;
+            IW = comp_src.layout.shape[spatial_pos + 2];
+            OW = comp_dst.layout.shape[spatial_pos + 2];
+            break;
+        case Param::Format::NCHW4:
+        case Param::Format::NCHW44:
+        case Param::Format::CHWN4:
+            C *= 4;
+            break;
+        case Param::Format::NCHW88:
+            C *= 8;
+            break;
+        case Param::Format::NCHW32:
+            C *= 32;
+            break;
+        case Param::Format::NCHW64:
+            C *= 64;
+            break;
+        default:;
     }
+
     size_t PH = param().pad_h, PW = param().pad_w;
     size_t FH = param().window_h, FW = param().window_w;
     size_t SH = param().stride_h, SW = param().stride_w;
@@ -427,8 +508,8 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         MEGDNN_DISPATCH_CPU_KERN(                                           \
                 static_cast<naive::HandleImpl*>(handle()),                  \
                 pooling_forward_impl<Pooler MEGDNN_COMMA IdxGetter>(        \
-                        sptr, dptr, src.layout.dtype, N, C, IH, IW, OH, OW, \
-                        PH, PW, SH, SW, FH, FW));                           \
+                        sptr, dptr, comp_src.layout.dtype, N, C, IH, IW, OH, \
+                        OW, PH, PW, SH, SW, FH, FW));                       \
     }                                                                       \
     MIDOUT_END();
 
@@ -455,6 +536,9 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
         case Param::Format::NCHW32:                                       \
             DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, NCHW32IdxGetter); \
             break;                                                        \
+        case Param::Format::NCHW64:                                       \
+            DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, NCHW64IdxGetter); \
+            break;                                                        \
         case Param::Format::CHWN4:                                        \
             DISPATCH_WITH_POOLER_AND_IDX_GETTER(Pooler, CHWN4IdxGetter);  \
             break;                                                        \
@@ -462,30 +546,35 @@ void PoolingForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
             megdnn_throw("invalid pooling format");                       \
     }
 
-#define cb(DType)                                                     \
-    if (src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {       \
-        using ctype = typename DTypeTrait<DType>::ctype;              \
-        switch (param().mode) {                                       \
-            case Mode::MAX: {                                         \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MaxPooler);                      \
-                return;                                               \
-            }                                                         \
-            case Mode::AVERAGE: {                                     \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MeanIncludePooler);              \
-                return;                                               \
-            }                                                         \
-            case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: {               \
-                auto sptr = src.ptr<ctype>();                         \
-                auto dptr = dst.ptr<ctype>();                         \
-                DISPATCH_WITH_POOLER(MeanExcludePooler);              \
-                return;                                               \
-            }                                                         \
-        }                                                             \
+#define cb(DType)                                                     \
+    if (comp_src.layout.dtype.enumv() == DTypeTrait<DType>::enumv) {  \
+        using ctype = typename DTypeTrait<DType>::ctype;              \
+        switch (param().mode) {                                       \
+            case Mode::MAX: {                                         \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MaxPooler);                      \
+                break;                                                \
+            }                                                         \
+            case Mode::AVERAGE: {                                     \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MeanIncludePooler);              \
+                break;                                                \
+            }                                                         \
+            case Mode::AVERAGE_COUNT_EXCLUDE_PADDING: {               \
+                auto sptr = comp_src.ptr<ctype>();                    \
+                auto dptr = comp_dst.ptr<ctype>();                    \
+                DISPATCH_WITH_POOLER(MeanExcludePooler);              \
+                break;                                                \
+            }                                                         \
+            default:                                                  \
+                megdnn_assert(0, "not supported mode");               \
+        }                                                             \
+        post_process(dst, comp_dst, handle(), wsb);                   \
+        return;                                                       \
     }
+
 MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
 MEGDNN_FOREACH_QUANTIZED_DTYPE(cb)
 #undef cb
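The naive backend sidesteps 4-bit arithmetic entirely: exec() widens int4/uint4 input into an 8-bit workspace tensor via int4_to_int8 / uint4_to_uint8 from lowbit_utils, runs the ordinary 8-bit pooling kernels on comp_src/comp_dst, and post_process() narrows the result back. Schematically:

    // QuantizedS4 src --int4_to_int8--> comp_src (QuantizedS8, wsb slot 0)
    // pooling_forward_impl(comp_src, comp_dst)   comp_dst (wsb slot 1)
    // comp_dst --int8_to_int4--> dst (QuantizedS4)

The quantization scale is carried over unchanged, so the widened values are the same integers bit for bit and the round trip introduces no re-quantization error.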
diff --git a/dnn/src/naive/pooling/opr_impl.h b/dnn/src/naive/pooling/opr_impl.h
index ed56fb646b7ea1b9dc51d9c904a40ac70ba74f8a..1590785195dff32d3a82432a2eaa1a3af07e80ed 100644
--- a/dnn/src/naive/pooling/opr_impl.h
+++ b/dnn/src/naive/pooling/opr_impl.h
@@ -20,10 +20,12 @@ class PoolingForwardImpl: public PoolingForward {
     using PoolingForward::PoolingForward;
     void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst,
               _megdnn_workspace workspace) override;
-    size_t get_workspace_in_bytes(const TensorLayout &,
-                                  const TensorLayout &) override {
-        return 0;
-    }
+    size_t get_workspace_in_bytes(const TensorLayout&,
+                                  const TensorLayout&) override;
+
+private:
+    WorkspaceBundle get_workspace_bundle(void* ptr, const TensorLayout&,
+                                         const TensorLayout&) const;
 };
 
 class PoolingBackwardImpl : public PoolingBackward {
diff --git a/dnn/test/common/checker.h b/dnn/test/common/checker.h
index a175b3671d39d4d1bc256ff64ff7e845f240745c..e727cf622d67df313096b395055dc0f59b0fd32b 100644
--- a/dnn/test/common/checker.h
+++ b/dnn/test/common/checker.h
@@ -414,34 +414,34 @@ TensorND TensorValue(const TensorShape& shape, T dtype,
 
 template <typename T, typename U>
 TensorND TensorValueLowbit4(const TensorShape& shape, T dtype,
-                            std::vector<U> values) {
+                            std::vector<U> values) {
     TensorND tensor;
     tensor.layout = {shape, dtype};
     tensor.raw_ptr =
             static_cast<dt_byte*>(malloc(tensor.layout.span().dist_byte()));
     megdnn_assert(values.size() == tensor.layout.total_nr_elems());
     auto ptr = tensor.ptr<typename DTypeTrait<T>::ctype>();
-    size_t i;
-    for (i = 0; i + 1 < values.size(); i += 2) {
-        U val0 = values[i], val1 = values[i + 1];
-        megdnn_assert(val0 >= DTypeTrait<T>::min());
-        megdnn_assert(val1 <= DTypeTrait<T>::max());
-        ptr[i / 2] = typename DTypeTrait<T>::ctype((val0 & 0xF) | (val1 << 4));
-    }
-    if (i < values.size()) {
-        U val0 = values[i];
-        megdnn_assert(val0 >= DTypeTrait<T>::min() &&
-                      val0 <= DTypeTrait<T>::max());
-        if (i + 1 < values.size()) {
-            U val1 = values[i + 1];
-            megdnn_assert(val1 >= DTypeTrait<T>::min() &&
-                          val1 <= DTypeTrait<T>::max());
-            ptr[i / 2] = typename DTypeTrait<T>::ctype((val0 & 0xF) | (val1 << 4));
-        } else {
-            ptr[i / 2] = typename DTypeTrait<T>::ctype(val0 & 0xF);
+    auto layout = tensor.layout;
+    auto dim_in = shape[layout.ndim - 1];
+    auto elems = tensor.layout.total_nr_elems();
+    auto dim_out = elems / dim_in;
+    auto stride_out = div_ceil(dim_in, 2_z);
+    size_t in_offset = 0;
+    for (size_t i = 0; i < dim_out; ++i) {
+        for (size_t j = 0; j < dim_in; j += 2) {
+            U a = values[in_offset + j];
+            U b = 0;
+            if (j + 1 < dim_in)
+                b = values[in_offset + j + 1];
+            megdnn_assert(a >= DTypeTrait<T>::min());
+            megdnn_assert(a <= DTypeTrait<T>::max());
+            megdnn_assert(b >= DTypeTrait<T>::min());
+            megdnn_assert(b <= DTypeTrait<T>::max());
+            ptr[j / 2] = (a & 0xF) | (b << 4);
         }
+        in_offset += dim_in;
+        ptr += stride_out;
     }
-
     return tensor;
 }
diff --git a/dnn/test/cuda/pooling.cpp b/dnn/test/cuda/pooling.cpp
index 8fcf9f5abe51f0f3f700be6f998b0252c88d3bcd..a98b8028f51a1b2d9c25281f989b8767ff14ee51 100644
--- a/dnn/test/cuda/pooling.cpp
+++ b/dnn/test/cuda/pooling.cpp
@@ -242,6 +242,20 @@ TEST_F(CUDA, POOLING_BACKWARD)
                 .exec(TensorShapeArray{ilayout, olayout, olayout, ilayout});
     }
 }
+TEST_F(CUDA, POOLING_FORWARD_NCHW_Q4) {
+    require_compute_capability(7, 5);
+    using Param = param::Pooling;
+    Checker<Pooling> checker(handle_cuda());
+    Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
+    checker.set_dtype(0, dtype::QuantizedS4(0.1f));
+    param.format = Param::Format::NCHW;
+    checker.set_epsilon(1 + 1e-3);
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{20, 64, 22, 33}, {}});
+}
 
 TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
     require_compute_capability(7, 5);
@@ -252,6 +266,10 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW4) {
     param.format = Param::Format::NCHW4;
     checker.set_epsilon(1 + 1e-3);
     checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{20, 3, 50, 50, 4}, {}});
 }
 
 #if CUDNN_VERSION >= 7500
@@ -267,9 +285,29 @@ TEST_F(CUDA, POOLING_FORWARD_NCHW32) {
     param.format = Param::Format::NCHW32;
     checker.set_epsilon(1e-3).set_rng(0, &int_rng);
     checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{64, 8, 28, 28, 32}, {}});
 }
 #endif
 
+TEST_F(CUDA, POOLING_FORWARD_NCHW64) {
+    require_compute_capability(7, 5);
+    using Param = param::Pooling;
+    Checker<Pooling> checker(handle_cuda());
+    Param param{Param::Mode::MAX, 0, 0, 2, 2, 2, 2};
+    UniformIntRNG int_rng{-8, 7};
+    checker.set_dtype(0, dtype::QuantizedS4(1.f));
+    param.format = Param::Format::NCHW64;
+    checker.set_epsilon(1e-3).set_rng(0, &int_rng);
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+    param.mode = Param::Mode::AVERAGE;
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+    param.mode = Param::Mode::AVERAGE_COUNT_EXCLUDE_PADDING;
+    checker.set_param(param).exec({{64, 8, 28, 28, 64}, {}});
+}
+
 TEST_F(CUDA, POOLING_FORWARD_CHWN4) {
     require_compute_capability(6, 1);
     using Param = param::Pooling;
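The TensorValueLowbit4 rewrite in checker.h matters for the odd-width inputs in the naive tests below: 4-bit tensors are now packed two elements per byte per innermost row, and an odd row length leaves a half-byte of padding so the next row starts byte-aligned. For a 3x3 input, each row of 3 values occupies ceil(3/2) = 2 bytes:

    // row {1, 2, 3} -> bytes 0x21, 0x03   (low nibble first; pad nibble is 0)
    // 3x3 tensor    -> 3 rows * 2 bytes = 6 bytes, not ceil(9/2) = 5

The old code packed the flat value stream with no per-row alignment, which disagrees with how the low-bit kernels index rows.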
diff --git a/dnn/test/naive/pooling.cpp b/dnn/test/naive/pooling.cpp
index 803b9c16b71e0146acdae2f4ab067bbbdefd6150..e8208ab901c66cc19bd54656dfc5d8a22246b32f 100644
--- a/dnn/test/naive/pooling.cpp
+++ b/dnn/test/naive/pooling.cpp
@@ -50,4 +50,63 @@ TEST_F(NAIVE, POOLING_QUANTIZED) {
                                12306, 23333})});
 }
 
+TEST_F(NAIVE, POOLING_QUANTIZED_Q4) {
+    using Mode = Pooling::Param::Mode;
+
+    Checker<Pooling> checker(handle(), /* check_dispatch */ false);
+
+    {
+        auto q4_dt = dtype::QuantizedS4(1.f);
+        std::vector<int8_t> i8_src_vec{1, 2,  3,
+                                       4, 5,  6,
+                                       7, -1, -2};
+        std::vector<int8_t> i8_max_dst_vec{1, 3, 7, 6};
+        std::vector<int8_t> i8_avg_dst_vec{0, 1, 3, 2};
+        std::vector<int8_t> i8_avg_exclu_dst_vec{1, 3, 6, 2};
+        Pooling::Param param{Mode::MAX, 1, 1, 2, 2, 2, 2};
+        Testcase input{TensorValueLowbit4({1, 1, 3, 3}, q4_dt, i8_src_vec), {}};
+
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_max_dst_vec)});
+        param = {Mode::AVERAGE, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_avg_dst_vec)});
+        param = {Mode::AVERAGE_COUNT_EXCLUDE_PADDING, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, q4_dt,
+                                                   i8_avg_exclu_dst_vec)});
+    }
+
+    {
+        auto u4_dt = dtype::Quantized4Asymm(1.f, 0);
+        std::vector<uint8_t> u8_src_vec{1, 2, 3,
+                                        4, 5, 6,
+                                        7, 8, 9};
+        std::vector<uint8_t> u8_max_dst_vec{1, 3, 7, 9};
+        std::vector<uint8_t> u8_avg_dst_vec{0, 1, 3, 7};
+        std::vector<uint8_t> u8_avg_exclu_dst_vec{1, 3, 6, 7};
+        Pooling::Param param{Mode::MAX, 1, 1, 2, 2, 2, 2};
+        Testcase input{TensorValueLowbit4({1, 1, 3, 3}, u4_dt, u8_src_vec), {}};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_max_dst_vec)});
+        param = {Mode::AVERAGE, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_avg_dst_vec)});
+        param = {Mode::AVERAGE_COUNT_EXCLUDE_PADDING, 1, 1, 2, 2, 2, 2};
+        checker.set_param(param).exect(
+                input, Testcase{{},
+                                TensorValueLowbit4({1, 1, 2, 2}, u4_dt,
+                                                   u8_avg_exclu_dst_vec)});
+    }
+}
+
 // vim: syntax=cpp.doxygen
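As a sanity check on the expected vectors in the signed test case: with pad 1, window 2, stride 2 on the 3x3 input, the bottom-right window covers {5, 6, -1, -2}, so MAX gives 6, AVERAGE gives round((5 + 6 - 1 - 2) / 4) = 2, and AVERAGE_COUNT_EXCLUDE_PADDING also divides by 4 since that window contains no padded taps, giving 2; these are the last elements of i8_max_dst_vec, i8_avg_dst_vec, and i8_avg_exclu_dst_vec. The top-left window covers only {1} plus three padded taps, so the include-padding average is round(1/4) = 0 while the exclude-padding average is 1.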