diff --git a/.gitattributes b/.gitattributes index ca42b36f94b6c841a299ec08a48b290d241c61f7..43ba47de5d7af4343823719253235591d9d6fe9a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,7 @@ dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary dnn/src/cuda/sass/prebuilt/map_defs.cpp binary dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary +dnn/src/cuda/elemwise_multi_type/kimpl/* binary tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text ci/resource/models/float/mobilenet_v2.pkl filter=lfs diff=lfs merge=lfs -text diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h index a01cea1c1c423766c6243615cfebbdc9a6764c51..2019f9362d3e51ce901a1bbde28fd1fac18e31f3 100644 --- a/dnn/include/megdnn/basic_types.h +++ b/dnn/include/megdnn/basic_types.h @@ -382,6 +382,9 @@ struct TensorLayout : public TensorShape { //! get lowest and highest offset reachable from this layout Span span() const; + + //! total number of access bytes + size_t access_bytes() const; }; /** diff --git a/dnn/include/megdnn/dtype.h b/dnn/include/megdnn/dtype.h index 539934ce9dfb3ddc7890d572cc9c26e96bff4356..74101e3ae8c8330b6dbc26b03a4228c9851ac417 100644 --- a/dnn/include/megdnn/dtype.h +++ b/dnn/include/megdnn/dtype.h @@ -308,6 +308,8 @@ class dt_qulowbit { return _; } + MEGDNN_DEVICE uint8_t as_storage() const { return _; } + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qulowbit(uint8_t val):_(val) {} #ifdef MEGDNN_CC_HOST explicit operator uint8_t() { return _; } @@ -332,6 +334,8 @@ class dt_qlowbit { return _; } + MEGDNN_DEVICE int8_t as_storage() const { return _; } + MEGDNN_HOST MEGDNN_DEVICE explicit dt_qlowbit(int8_t val):_(val) {} #ifdef MEGDNN_CC_HOST explicit operator int8_t() { return _; } diff --git a/dnn/scripts/gen_elemwise_multi_type_utils.py b/dnn/scripts/gen_elemwise_multi_type_utils.py index 7279c61f8cc41366acb3b8ae468936db8bb88554..ffccb1c8aed8b5ce9a2a3545d7fe5aa8ec2abc92 100755 --- a/dnn/scripts/gen_elemwise_multi_type_utils.py +++ b/dnn/scripts/gen_elemwise_multi_type_utils.py @@ -1,6 +1,10 @@ # As cuda currently do not support quint8, so we just ignore it. 
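# The tables below are (source dtype, destination dtype) pairs; each
# (mode, pair) combination corresponds to one generated
# kimpl/<MODE>_<src>_<dst>.cu file (added later in this diff as binary
# entries). The 4-bit pairs get their own, smaller mode tables: unary
# modes such as ACOS, ASIN and COS are not generated for qint4/quint4.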
SUPPORT_DTYPES = [('dt_qint8', 'dt_qint8')] -SUPPORT_QINT32_DTYPES = [('dt_qint32', 'dt_qint8'), ('dt_qint8', 'dt_qint32')] +SUPPORT_QINT32_DTYPES = [('dt_qint32', 'dt_qint8'), ('dt_qint8', 'dt_qint32'), + ('dt_qint4', 'dt_qint32'), ('dt_quint4', 'dt_qint32')] + +SUPPORT_DTYPES_Q4 = [('dt_qint4', 'dt_qint4'), ('dt_quint4', 'dt_quint4')] +SUPPORT_QINT32_DTYPES_Q4 = [('dt_qint32', 'dt_qint4'), ('dt_qint32', 'dt_quint4')] MODES = { 1: ['RELU', 'ABS', 'NEGATE', 'ACOS', 'ASIN', 'CEIL', 'COS', @@ -16,6 +20,15 @@ MODES = { 3: ['COND_LEQ_MOV', 'FUSE_MUL_ADD3'], } +QINT4_MODES = { + 1: ['RELU', 'ABS', 'NEGATE', 'CEIL', 'FLOOR', 'SIGMOID', + 'TANH', 'FAST_TANH', 'ROUND', 'H_SWISH'], + 2: ['ADD', 'MAX', 'MIN', 'MUL', 'SUB', 'SWITCH_GT0', + 'LT', 'LEQ', 'EQ', 'FUSE_ADD_RELU', 'FUSE_ADD_TANH', + 'FUSE_ADD_SIGMOID', 'FUSE_ADD_H_SWISH'], + 3: ['COND_LEQ_MOV', 'FUSE_MUL_ADD3'], +} + QINT32_MODES = { 1: ['RELU', 'SIGMOID', 'TANH', 'FAST_TANH', 'H_SWISH'], 2: ['ADD', 'FUSE_ADD_RELU', 'FUSE_ADD_SIGMOID', diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp index d2e96c9310cb5969720c975b4a507f903d863c7c..d6800bd15264992d281de4515926e23c93747777 100644 --- a/dnn/src/common/basic_types.cpp +++ b/dnn/src/common/basic_types.cpp @@ -212,7 +212,7 @@ TensorLayout::TensorLayout(const TensorShape& shape, DType dtype, TensorLayout::TensorLayout(const TensorShape& shape, const std::vector& stride, DType dtype) - : TensorLayout(shape, stride, dtype, DefaultTensorFormat::make()) {} + : TensorLayout(shape, stride, dtype, Format(dtype)) {} TensorLayout::TensorLayout(const TensorShape& shape, const std::vector& stride, DType dtype, @@ -412,6 +412,27 @@ TensorLayout::Span TensorLayout::span() const { return format.impl()->span_spec(*this); } +size_t TensorLayout::access_bytes() const { + megdnn_assert(dtype.valid()); + auto contig = collapse_contiguous(); + size_t ret = 0; + if (dtype.is_low_bit()) { + ret = 1; + int align_size_in_elements = 8 / dtype.low_bit(); + for (size_t i = 0; i < contig.ndim; ++i) { + if (contig.stride[i] == 1) { + ret *= round_up((int)contig.shape[i], align_size_in_elements); + } else { + ret *= contig.shape[i]; + } + } + ret /= align_size_in_elements; + } else { + ret = dtype.size(total_nr_elems()); + } + return ret; +} + TensorLayout TensorLayout::broadcast(const TensorShape& tshape) const { megdnn_throw_if(!ndim || !tshape.ndim, tensor_reshape_error, "broadcast involves empty tensor"); diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index 65cf17782deb088902b48b21a0617206f9f1530f..12fb03403f175f7d72c4e6285882b0a79a682994 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -236,33 +236,66 @@ INST(dt_qint8); INST(dt_quint8); #undef dt_ibyte +template +void ParamElemVisitor4bitBase::host_init( + const TensorND& rv, int /*grid_size*/, int /*block_size*/) { + m_ptr = reinterpret_cast(rv.raw_ptr); + for (size_t i = 0; i < rv.layout.ndim; ++i) { + m_stride[i] = rv.layout.stride[i]; + m_shape[i] = rv.layout.shape[i]; + if (i + 1 < rv.layout.ndim) { + m_shape_highdim[i] = rv.layout.shape[i + 1]; + if (rv.layout.stride[i + 1] == 1) + m_align_shape_highdim[i] = + (uint32_t)round_up((int)rv.layout.shape[i + 1], 2); + else + m_align_shape_highdim[i] = rv.layout.shape[i + 1]; + } + } + for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { + m_shape_highdim[i] = 1; + m_align_shape_highdim[i] = 1; + } + for (size_t i = rv.layout.ndim; i < ndim; ++i) { + m_stride[i] = 0; + m_shape[i] = 1; + } + m_is_physical_contiguous = 
rv.layout.is_physical_contiguous(); +} + +#define ndim_cb(_ndim) \ + template class ParamElemVisitor4bitBase<_ndim, BCAST_OTHER>; +MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) +#undef ndim_cb + } // namespace elemwise_intl void elemwise_intl::get_launch_spec(const void* kern, size_t size, int* grid_size, int* block_size) { - safe_size_in_kern(size); - auto config = query_launch_config_for_kernel(kern); - *block_size = config.block_size; - int a = size / (config.block_size * 2), - b = (size - 1) / (config.block_size * 3) + 1; - if (current_device_prop().major <= 3) { - // for Kepler, less blocks (more work per thread) is faster - *grid_size = b; - } else { - *grid_size = std::max(a, b); + safe_size_in_kern(size); + auto config = query_launch_config_for_kernel(kern); + *block_size = config.block_size; + int a = size / (config.block_size * 2), + b = (size - 1) / (config.block_size * 3) + 1; + if (current_device_prop().major <= 3) { + // for Kepler, less blocks (more work per thread) is faster + *grid_size = b; + } else { + *grid_size = std::max(a, b); + } + if (!*grid_size) { + *block_size = std::min(std::max(size / 64, 1) * 32, 1024); + *grid_size = std::max(size / *block_size, 1); + } + // because we unroll 3 times in the kernel + megdnn_assert(static_cast(*block_size) * *grid_size * 3 >= + size); } - if (!*grid_size) { - *block_size = std::min(std::max(size / 64, 1) * 32, 1024); - *grid_size = std::max(size / *block_size, 1); - } - // because we unroll 3 times in the kernel - megdnn_assert(static_cast(*block_size) * *grid_size * 3 >= size); -} -void elemwise_intl::on_bad_ndim(int ndim) { - megdnn_throw(ssprintf("invalid ndim: %d", ndim)); - MEGDNN_MARK_USED_VAR(ndim); -} + void elemwise_intl::on_bad_ndim(int ndim) { + megdnn_throw(ssprintf("invalid ndim: %d", ndim)); + MEGDNN_MARK_USED_VAR(ndim); + } } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index d95d49f1873a97b164fa075d5ca867df30705bcd..3b62312b0f265ffddc6d04eeecef152c2fd896d5 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -115,6 +115,34 @@ INST(dt_qint32, int4); #undef as_raw #undef INST +struct int4bx2 { + int8_t x; +}; + +struct uint4bx2 { + uint8_t x; +}; + +#define INST(_ctype, _Storage, _vect_type) \ + template <> \ + class VectTypeTrait<_ctype> { \ + public: \ + using Storage = _Storage; \ + static const Storage kMask = 0xf; \ + static const Storage kBits = 4; \ + using vect_type = _vect_type; \ + static const size_t packed_size = 2; \ + static __device__ __forceinline__ vect_type make_vector(Storage x, \ + Storage y) { \ + vect_type t; \ + t.x = (x & kMask) | (y << kBits); \ + return t; \ + } \ + } +INST(dt_qint4, int8_t, int4bx2); +INST(dt_quint4, uint8_t, uint4bx2); +#undef INST + /*! * \brief visitor to access an elemeent in a tensor at given logic index * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) @@ -217,6 +245,7 @@ template class ParamElemVisitor : public ParamVisitorBase { public: + using CType = ctype; PARAM_ELEM_VISITOR_COMMON_HOST void host_init(const TensorND& rv, int grid_size, int block_size) { @@ -500,6 +529,177 @@ public: #endif }; +template +class ParamElemVisitor4bitBase; + +template +class ParamElemVisitor4bitBase { + using Storage = int8_t; + +protected: + Storage* __restrict m_ptr; + int m_stride[ndim]; + int m_shape[ndim]; + bool m_is_physical_contiguous; + + //! m_shape_highdim[i] = original_shape[i + 1] +#ifdef _MSC_VER + Uint32Fastdiv m_shape_highdim[ndim > 1 ? 
ndim - 1 : 1]; + Uint32Fastdiv m_align_shape_highdim[ndim > 1 ? ndim - 1 : 1]; +#else + Uint32Fastdiv m_shape_highdim[ndim]; + Uint32Fastdiv m_align_shape_highdim[ndim]; +#endif + +public: + static const Storage kMask = 0xf; + static const Storage kBits = 4; + static const int NDIM = ndim; + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t) {} + + devfunc void next() {} + + devfunc void get_shape_from_access(uint32_t access_idx, + int (&shape_idx)[ndim]) { +#pragma unroll + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& align_shp = m_align_shape_highdim[i - 1]; + uint32_t access_idx_div = access_idx / align_shp; + shape_idx[i] = access_idx - access_idx_div * align_shp.divisor(); + access_idx = access_idx_div; + } + shape_idx[0] = access_idx; + } + + devfunc int offset(uint32_t idx) { + int offset = 0; +#pragma unroll + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; + uint32_t idx_div = idx / shp; + offset += (idx - idx_div * shp.divisor()) * m_stride[i]; + idx = idx_div; + } + offset += idx * m_stride[0]; + return offset; + } + + devfunc int idx(uint32_t access_idx) { + int idx = 0; + if (m_is_physical_contiguous) { + idx = access_idx; + } else { + int shape_idx[ndim]; + bool valid = true; + get_shape_from_access(access_idx, shape_idx); +#pragma unroll + for (int i = 0; i < ndim; ++i) { + valid &= (shape_idx[i] < m_shape[i]); + } +#pragma unroll + for (int i = 0; i < ndim - 1; ++i) { + idx = (idx + shape_idx[i]) * m_shape[i + 1]; + } + idx = valid ? idx + shape_idx[ndim - 1] : -1; + } + return idx; + } + + devfunc Storage* ptr() { return m_ptr; } +#endif +}; + +template +class ParamElemVisitor + : public ParamElemVisitor4bitBase { + using CType = dt_qint4; + using Storage = int8_t; + +public: + static const int packed_size = 1; + using Super = ParamElemVisitor4bitBase; + + void host_init(const TensorND& rv, int grid_size, int block_size) { + Super::host_init(rv, grid_size, block_size); + } + +#if MEGDNN_CC_CUDA + // cannot be l-value, only support read + devfunc dt_qint4 at(uint32_t idx) { + int offset_ = Super::offset(idx); + int vec_idx = offset_ >> 1; + int lane_idx = offset_ & 0x1; + + Storage item = Storage(unpack_integer_4bits( + *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); + + dt_qint4 result(item); + + return result; + } +#endif +}; + +template +class ParamElemVisitor + : public ParamElemVisitor4bitBase { + using CType = dt_quint4; + using Storage = uint8_t; + using Super = ParamElemVisitor4bitBase; + +public: + static const int packed_size = 1; + + void host_init(const TensorND& rv, int grid_size, int block_size) { + Super::host_init(rv, grid_size, block_size); + } + +#if MEGDNN_CC_CUDA + // cannot be l-value, only support read + devfunc dt_quint4 at(uint32_t idx) { + int offset_ = Super::offset(idx); + int vec_idx = offset_ >> 1; + int lane_idx = offset_ & 0x1; + + Storage item = Storage(unpack_integer_4bits( + *(Storage*)&Super::m_ptr[vec_idx], lane_idx * 4)); + + dt_quint4 result(item); + + return result; + } +#endif +}; + +#if MEGDNN_CC_CUDA +#define DEVICE_WRAPPER(x) x +#else +#define DEVICE_WRAPPER(x) +#endif + +#define INST_DT_IBYTE(ctype) \ + template \ + class ParamVectVisitor \ + : public ParamElemVisitor4bitBase { \ + public: \ + using Super = ParamElemVisitor4bitBase; \ + void host_init(const TensorND& rv, int grid_size, int block_size) { \ + Super::host_init(rv, grid_size, block_size); \ + } \ + using rwtype = typename 
VectTypeTrait::vect_type; \ + static const int packed_size = VectTypeTrait::packed_size; \ + DEVICE_WRAPPER(devfunc rwtype& at(uint32_t access_idx) { \ + return *(rwtype*)(&Super::m_ptr[access_idx]); \ + }) \ + }; +INST_DT_IBYTE(dt_qint4); +INST_DT_IBYTE(dt_quint4); +#undef DEVICE_WRAPPER +#undef INST_DT_IBYTE + /* f}}} */ #if MEGDNN_CC_CUDA @@ -507,7 +707,8 @@ public: /* f{{{ user operator callers */ /* - * OpCaller is used to invoke user operator with loaded element arguments. + * OpCaller is used to invoke user operator with loaded element + * arguments. * * device interface: * void thread_init(uint32_t idx); @@ -518,8 +719,8 @@ public: */ /*! - * \brief call user op directly without visiting any params (i.e. arity == - * 0) + * \brief call user op directly without visiting any params (i.e. arity + * == 0) */ template struct OpCallerNull { @@ -1151,6 +1352,20 @@ public: } }; +#define INST_DT_TYPE(ctype) \ + template \ + class UserOpInvoker \ + : public UserOpInvokerToSameNdim { \ + public: \ + UserOpInvoker(const ElemwiseOpParamN<2>& param, cudaStream_t stream, \ + const Op& op) \ + : UserOpInvokerToSameNdim(param, stream, op) {} \ + } + +INST_DT_TYPE(dt_qint4); +INST_DT_TYPE(dt_quint4); +#undef INST_DT_TYPE + #define DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, \ _stride) \ DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \ @@ -1404,7 +1619,6 @@ void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, #define INST_RUN_ELEMWISE(Op, ctype, arity) \ template void run_elemwise( \ const ElemwiseOpParamN&, cudaStream_t, const Op&) - #endif } // namespace cuda diff --git a/dnn/src/cuda/elemwise_helper_q4.cuh b/dnn/src/cuda/elemwise_helper_q4.cuh new file mode 100644 index 0000000000000000000000000000000000000000..af21f72508b3d638e75be89d1b1ac36b456b10c2 --- /dev/null +++ b/dnn/src/cuda/elemwise_helper_q4.cuh @@ -0,0 +1,256 @@ +/** + * \file dnn/src/cuda/elemwise_helper_q4.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once + +#include "src/cuda/elemwise_helper.cuh" + +/* + * please note that all arithmetics on GPU are 32-bit for best performance; this + * limits max possible size + */ + +namespace megdnn { +namespace cuda { + +template +struct IsNotTypeQ4 { + static constexpr bool value = !(std::is_same::value || + std::is_same::value); +}; + +template +struct IsTypeQ4 { + static constexpr bool value = (std::is_same::value || + std::is_same::value); +}; + +//! internals for element-wise +namespace elemwise_intl { +#define devfunc __device__ __forceinline__ + +#if MEGDNN_CC_CUDA +/*! + * \brief call an operator whose each param are promted to the same ndim and + * brdcast_mask + * \tparam PVis ParamElemVisitor class + */ +template +struct OpCallerToQ4; + +//! specialization for arity == 1 +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[1]; + PVisDst par_dst[1]; + using src_ctype = typename PVisSrc::CType; + + devfunc void on(uint32_t access_idx) { + int32_t idx0 = par_dst[0].idx(access_idx * 2); + int32_t idx1 = par_dst[0].idx(access_idx * 2 + 1); + src_ctype src0 = (idx0 >= 0) ? par_src[0].at(idx0) : (src_ctype)0; + src_ctype src1 = (idx1 >= 0) ? 
par_src[0].at(idx1) : (src_ctype)0; + op(access_idx, src0, src1); + } +}; +//! specialization for arity == 2 +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[2]; + PVisDst par_dst[1]; + using src_ctype = typename PVisSrc::CType; + + devfunc void on(uint32_t access_idx) { + int32_t idx0 = par_dst[0].idx(access_idx * 2); + int32_t idx1 = par_dst[0].idx(access_idx * 2 + 1); + src_ctype src00 = (idx0 >= 0) ? par_src[0].at(idx0) : (src_ctype)0; + src_ctype src10 = (idx0 >= 0) ? par_src[1].at(idx0) : (src_ctype)0; + src_ctype src01 = (idx0 >= 0) ? par_src[0].at(idx1) : (src_ctype)0; + src_ctype src11 = (idx0 >= 0) ? par_src[1].at(idx1) : (src_ctype)0; + + op(access_idx, src00, src10, src01, src11); + } +}; + +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[3]; + PVisDst par_dst[1]; + using src_ctype = typename PVisSrc::CType; + + devfunc void on(uint32_t access_idx) { + int32_t idx0 = par_dst[0].idx(access_idx * 2); + int32_t idx1 = par_dst[0].idx(access_idx * 2 + 1); + src_ctype src00 = (idx0 >= 0) ? par_src[0].at(idx0) : (src_ctype)0; + src_ctype src10 = (idx0 >= 0) ? par_src[1].at(idx0) : (src_ctype)0; + src_ctype src20 = (idx0 >= 0) ? par_src[2].at(idx0) : (src_ctype)0; + src_ctype src01 = (idx0 >= 0) ? par_src[0].at(idx1) : (src_ctype)0; + src_ctype src11 = (idx0 >= 0) ? par_src[1].at(idx1) : (src_ctype)0; + src_ctype src21 = (idx0 >= 0) ? par_src[2].at(idx1) : (src_ctype)0; + + op(access_idx, src00, src10, src20, src01, src11, src21); + } +}; + +//! specialization for arity == 1 +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[1]; + PVisDst par_dst[1]; + + devfunc void on(uint32_t access_idx) { + op(access_idx, par_src[0].at(access_idx)); + } +}; +//! specialization for arity == 2 +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[2]; + PVisDst par_dst[1]; + + devfunc void on(uint32_t access_idx) { + op(access_idx, par_src[0].at(access_idx), par_src[1].at(access_idx)); + } +}; + +template +struct OpCallerToQ4 { + Op op; + PVisSrc par_src[3]; + PVisDst par_dst[1]; + + devfunc void on(uint32_t access_idx) { + op(access_idx, par_src[0].at(access_idx), par_src[1].at(access_idx), + par_src[2].at(access_idx)); + } +}; + +/* f}}} */ + +template +__global__ void cuda_kern_q4(OpCaller op_caller, uint32_t size) { + uint32_t access_idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + if (access_idx < size) { + op_caller.on(access_idx); + access_idx += delta; + if (access_idx < size) { + op_caller.on(access_idx); + access_idx += delta; + if (access_idx < size) { + op_caller.on(access_idx); + } + } + } +} + +/* f{{{ UserOpInvoker specializations */ + +//! 
run op by promoting all params to same ndim +template +class UserOpInvokerQ4 { + const ElemwiseOpParamN& m_src_param; + const ElemwiseOpParamN<1>& m_dst_param; + cudaStream_t m_stream; + const Op& m_op; + + void dispatch0() { + switch (m_dst_param.max_ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + on_bad_ndim(m_dst_param.max_ndim); + } + + template + void dispatch1() { + using PVisSrc = typename std::conditional< + BetweenQ4, ParamVectVisitor, + ParamElemVisitor>::type; + + typedef OpCallerToQ4, + BetweenQ4> + Caller; + + size_t size = m_dst_param[0].layout.access_bytes(); + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern_q4; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par_src[i].host_init(m_src_param[i], grid_size, block_size); + caller.par_dst[0].host_init(m_dst_param[0], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + +public: + UserOpInvokerQ4(const ElemwiseOpParamN& src_param, + const ElemwiseOpParamN<1>& dst_param, cudaStream_t stream, + const Op& op) + : m_src_param(src_param), + m_dst_param(dst_param), + m_stream(stream), + m_op(op) { + dispatch0(); + } +}; +#endif +/* f}}} */ + +#undef devfunc +} // namespace elemwise_intl + +template +void run_elemwise(const ElemwiseOpParamN& src_param, + const ElemwiseOpParamN<1>& dst_param, cudaStream_t stream, + const Op& op = Op()); +#if MEGDNN_CC_CUDA + +template +void run_elemwise(const ElemwiseOpParamN& src_param, + const ElemwiseOpParamN<1>& dst_param, cudaStream_t stream, + const Op& op) { + src_param.assert_initialized(); + dst_param.assert_initialized(); + // TODO: Maybe 2bit? + megdnn_assert(dst_param[0].layout.dtype.is_low_bit()); + megdnn_assert(dst_param[0].layout.is_contiguous()); + + elemwise_intl::UserOpInvokerQ4::value>( + src_param, dst_param, stream, op); +} + +#define INST_RUN_ELEMWISE_LOWBIT(Op, src_ctype, dst_ctype, arity) \ + template void run_elemwise( \ + const ElemwiseOpParamN&, const ElemwiseOpParamN<1>&, \ + cudaStream_t, const Op&) +#endif + +} // namespace cuda +} // namespace megdnn + +// vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/src/cuda/elemwise_multi_type/kern_impl_q4.inl b/dnn/src/cuda/elemwise_multi_type/kern_impl_q4.inl new file mode 100644 index 0000000000000000000000000000000000000000..f73f119b69810d91677e3ef962ce710f44522dbd --- /dev/null +++ b/dnn/src/cuda/elemwise_multi_type/kern_impl_q4.inl @@ -0,0 +1,39 @@ +/** + * \file dnn/src/cuda/elemwise_multi_type/kern_impl_q4.inl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
+ */ + +#pragma once + +#ifndef KERN_IMPL_MODE +#error "KERN_IMPL_MODE, KERN_IMPL_ARITY, KERN_IMPL_STYPE, KERN_IMPL_DTYPE must be defined" +#endif + +#include "src/cuda/elemwise_multi_type/kern_ops.cuh" + +namespace megdnn { +namespace cuda { + +#define cb(_m) \ + typedef ElemwiseKern \ + KernImpl; \ + typedef kern_ops_quantized::QuantizedMultiTypeOp< \ + KERN_IMPL_ARITY, KERN_IMPL_STYPE, KERN_IMPL_DTYPE, KernImpl> \ + Op; \ + INST_RUN_ELEMWISE_LOWBIT(Op, KERN_IMPL_STYPE, KERN_IMPL_DTYPE, \ + KERN_IMPL_ARITY); + +KERN_IMPL_MODE(cb) + +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh index 9dc9322c5beb6fac765b97845791834f7968cad3..658f12d899a29cb2a0b360bbf4f253d2464dee3c 100644 --- a/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh +++ b/dnn/src/cuda/elemwise_multi_type/kern_ops.cuh @@ -6,11 +6,13 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once #include "src/cuda/elemwise_helper.cuh" +#include "src/cuda/elemwise_helper_q4.cuh" #include "src/cuda/elemwise_multi_type/kern.cuh" #include "src/cuda/utils.cuh" @@ -127,10 +129,10 @@ struct QuantizedMultiTypeOp; template struct QuantizedMultiTypeOp< 1, ctype_src, ctype_dst, KernImpl, - typename std::enable_if< - std::is_same::value || - std::is_same::value || - std::is_same::value>::type> { + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsNotTypeQ4::value>::type> { ctype_dst* dst; CudaDTypeParam dst_param; CudaDTypeParam param_a; @@ -173,10 +175,10 @@ struct QuantizedMultiTypeOp< template struct QuantizedMultiTypeOp< 2, ctype_src, ctype_dst, KernImpl, - typename std::enable_if< - std::is_same::value || - std::is_same::value || - std::is_same::value>::type> { + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsNotTypeQ4::value>::type> { ctype_dst* dst; CudaDTypeParam dst_param; CudaDTypeParam param_a, param_b; @@ -224,10 +226,10 @@ struct QuantizedMultiTypeOp< template struct QuantizedMultiTypeOp< 3, ctype_src, ctype_dst, KernImpl, - typename std::enable_if< - std::is_same::value || - std::is_same::value || - std::is_same::value>::type> { + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsNotTypeQ4::value>::type> { ctype_dst* dst; CudaDTypeParam dst_param; CudaDTypeParam param_a, param_b, param_c; @@ -277,6 +279,367 @@ struct QuantizedMultiTypeOp< #endif }; +template +struct QuantizedMultiTypeOp< + 1, ctype_src, ctype_dst, KernImpl, + typename std::enable_if::value && + IsNotTypeQ4::value>::type> { + ctype_dst* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + ctype_dst* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ ctype_dst apply(ctype_src v1) { + float fv1 = param_a.dequantize(v1); + float rv = KernImpl::apply(fv1); + return dst_param.quantize(rv); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a) { + dst[idx] = 
dst_param.quantize(KernImpl::apply(param_a.dequantize(a))); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 2, ctype_src, ctype_dst, KernImpl, + typename std::enable_if::value && + IsNotTypeQ4::value>::type> { + ctype_dst* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + ctype_dst* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ ctype_dst apply(ctype_src v1, ctype_src v2) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2); + float rv = KernImpl::apply(fv1, fv2); + return dst_param.quantize(rv); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a, + ctype_src b) { + dst[idx] = dst_param.quantize( + KernImpl::apply(param_a.dequantize(a), param_b.dequantize(b))); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 1, ctype_src, ctype_dst, KernImpl, + typename std::enable_if::value && + IsTypeQ4::value>::type> { + using src_storage = + typename elemwise_intl::VectTypeTrait::Storage; + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a; + static constexpr bool src_signedness = + std::is_same::value; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ dst_storage apply(src_storage v1) { + float fv1 = param_a.dequantize(v1); + float rv = KernImpl::apply(fv1); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a) { + dst_storage x = apply( + src_storage(unpack_integer_4bits(a.x, 0))); + dst_storage y = apply( + src_storage(unpack_integer_4bits(a.x, 4))); + + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 1, ctype_src, ctype_dst, KernImpl, + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsTypeQ4::value>::type> { + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ dst_storage apply(ctype_src v1) { + float fv1 = param_a.dequantize(v1); + float rv = KernImpl::apply(fv1); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a_x, + ctype_src a_y) { + dst_storage x = apply(a_x), y = apply(a_y); + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 2, ctype_src, ctype_dst, KernImpl, + typename std::enable_if::value && + IsTypeQ4::value>::type> { + using 
src_storage = + typename elemwise_intl::VectTypeTrait::Storage; + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b; + static constexpr bool src_signedness = + std::is_same::value; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ dst_storage apply(src_storage v1, + src_storage v2) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2); + float rv = KernImpl::apply(fv1, fv2); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, + src_vect_type b) { + src_storage a_x = + src_storage(unpack_integer_4bits(a.x, 0)); + src_storage a_y = + src_storage(unpack_integer_4bits(a.x, 4)); + src_storage b_x = + src_storage(unpack_integer_4bits(b.x, 0)); + src_storage b_y = + src_storage(unpack_integer_4bits(b.x, 4)); + + dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); + + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 2, ctype_src, ctype_dst, KernImpl, + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsTypeQ4::value>::type> { + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ dst_storage apply(ctype_src v1, ctype_src v2) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2); + float rv = KernImpl::apply(fv1, fv2); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a_x, + ctype_src b_x, ctype_src a_y, + ctype_src b_y) { + dst_storage x = apply(a_x, b_x), y = apply(a_y, b_y); + + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 3, ctype_src, ctype_dst, KernImpl, + typename std::enable_if::value && + IsTypeQ4::value>::type> { + using src_storage = + typename elemwise_intl::VectTypeTrait::Storage; + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b, param_c; + static constexpr bool src_signedness = + std::is_same::value; + typedef typename elemwise_intl::VectTypeTrait::vect_type + src_vect_type; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + param_c = src_params[2]; + } +#endif + +#if MEGDNN_CC_CUDA + 
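    // Layout note for the 4-bit specialization below: each src_vect_type
    // value is one byte holding two quantized elements, lane 0 in the low
    // nibble and lane 1 in the high nibble. operator() unpacks both lanes of
    // a, b and c with unpack_integer_4bits, evaluates apply() once per lane,
    // and repacks the two results into a single output byte through
    // elemwise_intl::VectTypeTrait<ctype_dst>::make_vector().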
__device__ __forceinline__ dst_storage apply(src_storage v1, src_storage v2, + src_storage v3) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2), + fv3 = param_c.dequantize(v3); + float rv = KernImpl::apply(fv1, fv2, fv3); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, src_vect_type a, + src_vect_type b, + src_vect_type c) { + src_storage a_x = + src_storage(unpack_integer_4bits(a.x, 0)); + src_storage a_y = + src_storage(unpack_integer_4bits(a.x, 4)); + src_storage b_x = + src_storage(unpack_integer_4bits(b.x, 0)); + src_storage b_y = + src_storage(unpack_integer_4bits(b.x, 4)); + src_storage c_x = + src_storage(unpack_integer_4bits(c.x, 0)); + src_storage c_y = + src_storage(unpack_integer_4bits(c.x, 4)); + + dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); + + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + +template +struct QuantizedMultiTypeOp< + 3, ctype_src, ctype_dst, KernImpl, + typename std::enable_if<(std::is_same::value || + std::is_same::value || + std::is_same::value) && + IsTypeQ4::value>::type> { + using dst_storage = + typename elemwise_intl::VectTypeTrait::Storage; + dst_storage* dst; + CudaDTypeParam dst_param; + CudaDTypeParam param_a, param_b, param_c; + typedef typename elemwise_intl::VectTypeTrait::vect_type + dst_vect_type; + +#if !MEGDNN_CC_CUDA + QuantizedMultiTypeOp( + const SmallVector>& src_params, + dst_storage* dst, const CudaDTypeParam& dst_param) + : dst{dst}, dst_param{dst_param} { + param_a = src_params[0]; + param_b = src_params[1]; + param_c = src_params[2]; + } +#endif + +#if MEGDNN_CC_CUDA + __device__ __forceinline__ dst_storage apply(ctype_src v1, ctype_src v2, + ctype_src v3) { + float fv1 = param_a.dequantize(v1), fv2 = param_b.dequantize(v2), + fv3 = param_c.dequantize(v3); + float rv = KernImpl::apply(fv1, fv2, fv3); + return dst_param.quantize(rv).as_storage(); + } + + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src a_x, + ctype_src b_x, ctype_src c_x, + ctype_src a_y, ctype_src b_y, + ctype_src c_y) { + dst_storage x = apply(a_x, b_x, c_x), y = apply(a_y, b_y, c_y); + + *(dst_vect_type*)(&dst[idx]) = + elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + } // namespace kern_ops_quantized } // namespace cuda diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..d42113874312be35c41f2bb13cd38d57bc6cdb1d Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..2bc037a9376655ceb1ad9fe27ae4a7e11f64da28 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ABS_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..a979c47fc682d872be7542b47e58603cb649b23b Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_quint4.cu 
b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf3b910326c074d06e8d07fd5412fb1bb13c71c6 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..7a2bb2f4bc543c40cb3e08a37f7b5e4becbc31ba Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..8f3f5dd464bd996d6b864c13dae75cded03001ad Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..323dd929d6ad872fd33c8d9365fdfddedd37c80c Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..d063e24979b31394988eca1248523131de21fcb2 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ADD_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..d844ef6e0950a7c6f7089bec84adc2270ebb4f0a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..18d6c377ec3a6f0021ccc8406ebb078e4a4c7566 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/CEIL_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..625774e00bf22a35f9941e013acfdbee95d091c1 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ecba9c7ce1ea9af9379b38b5b5d2a327231397c Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/COND_LEQ_MOV_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..4186b04b1fc8ceee55ad30f0614f4d6d8221b4eb Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_qint4_dt_qint4.cu differ diff --git 
a/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..64a314ad09a344c7c14cc4d4b3a87009132b6fa9 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/EQ_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..be7a40fd0dafa0737c3115ddf11c11c6ddb0da02 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..df42e885d9cb2030076d27eb6e37423253d67a60 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..a463825e1cb8dcd5f1bf568b6dc9cfcf19579c9e Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..1e558b17549441b41ab48e06ae412ad2bcd7a502 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..8503017f5ffa006e77073325f43aa12166cb4821 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..cc94da23198bb09ae0e26cef3a4d6057270dca28 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FAST_TANH_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..2fc921201749e8bf9795d8aaf86b8f110628c243 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..f16f555e0f5354251fb84f4a0f23d1999b9df6a7 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FLOOR_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint4.cu new file mode 100644 index 
0000000000000000000000000000000000000000..8a68e05812589545b081ef7483681f0c81e6b346 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..5f118c811918dfa88426787524d467c79b33a78c Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..1cdd04fcf73418251798738b9ea7eb58dcf37a7a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..43afad0a4b3fabfe8c48a2edf2adbffb00c93140 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..3df745a69d3a963702aebc925a113684e04a4acc Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..b32ae7dd8acda7f92d953c8ea15f36f4a0f6e66a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_H_SWISH_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..6a72e8fea6e678d5db85b502c6da118e1ba2e116 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..19f4984e623c004be194a04d75272c8e360adf28 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..bcc6c93f8b8059678043ded1b76343c9ca9a296d Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint4.cu new file mode 100644 index 
0000000000000000000000000000000000000000..1bbe4b65cca6a88c4d7874f68c22ae999befd26a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..0ed56d05991af9034f47104676693cfdf6271f1e Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..5c626d4c95183a5e96873f7db84e5727b99a53df Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_RELU_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..af01bb606013173493f5cbea4027efba8173b8a7 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..1a989ece2b07e4eeb5bbe1a9cabe6c8aec3ba37a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..47192b8e79d132327798138702c9ff177fbad7b8 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..37f77a1a77dca5d24a80fd1c2bb474ace4dd2d5b Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..1a975d24db10611a765151ad09b54f5e70ba0aa4 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..b3baefabd62d2d6a032d8c28586a5e336e903489 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_SIGMOID_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint4.cu new file mode 100644 index 
0000000000000000000000000000000000000000..6dd395e0f2bb5b0e4fb910e2dc6be150bff8605a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..98142053500d169fcd757c9f1df39b668385f315 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..6c86b0a90c3392e2bcf0844b286294696fc29a6e Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..4340a282755f49c43d12abf9e57dfc95f7562340 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..484641d25abcea8bfe3af9bdc8ce8ead044b5410 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..66066de041b103dfcf3097dcea65579bae5101a0 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_ADD_TANH_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..0314a2ff76cc38fd1bc2bc6d0f790c4c67113ec8 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..2b16977d9748ca69a98f9639a3048bee0b86d36d Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/FUSE_MUL_ADD3_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..13c9dc2cbe918fd211490a9e43fdef24a0e5accc Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..889fac16d7cfeeb805321a9d0fcd5c4ffe7af5ef Binary files 
/dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..d8d61570aafcd48069305f44dbee993e2f9b4b73 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..9d1a414fb75900cb1914fc72703672f6aba637e7 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..e0641f28d9e55e84cffbf2b3d96f9dd10004aaad Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..0052c5a8bebc2baed62d32c03c24bb947f043f05 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/H_SWISH_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..71d2f8d72ad05ea5607d2ccc750cbad643a8d5d6 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e255830de848bc6514a75f834528f64282f2d63 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/LEQ_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..196cce34170b7147c98bd6d46863312b5a30abb3 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..6832349d774e29173da5a9e5998ba2223523f300 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/LT_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..68cdbd8a7db3113001bfe495e2b65e0f4d81d905 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_quint4_dt_quint4.cu new file mode 100644 index 
0000000000000000000000000000000000000000..6fc236b1a80eb63b94c1ec0769d2b28902e44713 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MAX_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..1fd5c6466aa73c1346dd1978fc156bd84d1a14a0 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..a82e56d7cb373382399adff1206b88e5214fa5b5 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MIN_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..29c309847ee08c08cd01dfe9e43635f788229923 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..9ecf4ad1c281477f3b41b6ea840c9190729c89a5 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/MUL_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..353ad75bcd88ffe1691f36582376d0ef7c6846b1 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..cbdb016e20f3baab69acabe89e9aa699482d2f08 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/NEGATE_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..d3e611fa43d3bfd7a6a5acb4f876f231b849a155 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..c8a08155dcc2a65db58d55e469fc675c5e6a41cc Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..67199f82fcfc39288eb75a3abe9afde2a4a8339f Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint4.cu 
b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..a8e0ad31b4b25e9dc0bbb3472f94fe914d9bab91 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..10a6bf7642a5f0798ac9aec857a65d4ee163118c Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..106d00426e6254c9bceeb2eb8fa77716b3d8280f Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/RELU_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..fd2b758460801b31b5f822b7c6e668c0c1f86b41 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..d2096352f9791a8c1e98cf8f9e6f0a508a41c201 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/ROUND_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..f809012ac99cec62c41160591b17b18f867b9d63 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..8540d498099d7ee513686f3c85419f42469bebe5 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..a4b8c4e52e7394ff95a2f778336278b60112fc2d Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..08f23b9089acdf8c2a594f3fe6fc839f1476366b Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..6788a3edb91dbff6b9de53ed27c391a0cff19b71 Binary files /dev/null and 
b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a043116ec1a67bb74e1a7b67312bc9b0e60a12f Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SIGMOID_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..c9013edaad248632b2cd143083f76d11ee223b68 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..c7633d5186ec8662af6b8ac7334d1615b55cc712 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SUB_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..528a26dab4785ed74d1dc38c9fc1730c5a8138a5 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..b6ac9ffd4abc22edd9362c592728d26d02feaa6c Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/SWITCH_GT0_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..b09d8b67db9681fb6c251eebf76aff4221896cb4 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..b1663ff22bfc74cb582400620651be22014a22ec Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint32_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint32.cu new file mode 100644 index 0000000000000000000000000000000000000000..506b9aceab3c548b123bdc4fd88f5fda8319d158 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..54c62553b662e0542ec5383718517726094f6fdf Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_qint4_dt_qint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_qint32.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_qint32.cu new file mode 100644 index 
0000000000000000000000000000000000000000..96c6c801733a4c68e7ff2c301dd5938d4f518c7a Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_qint32.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_quint4.cu b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_quint4.cu new file mode 100644 index 0000000000000000000000000000000000000000..208e4564a2c80ea2cb20a9c684c63267a478aa53 Binary files /dev/null and b/dnn/src/cuda/elemwise_multi_type/kimpl/TANH_dt_quint4_dt_quint4.cu differ diff --git a/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp index 9108a4c852c01d8c5d2c2d1ddb46248f7756fd95..5346d27b007a77a8d00d72df7f32fd1a281e7b43 100644 --- a/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/cuda/elemwise_multi_type/opr_impl.cpp @@ -167,6 +167,7 @@ struct ModeDispatcher; param_enumv::Elemwise::Mode::_m, float>; \ using Op = kern_ops_quantized::QuantizedMultiTypeOp< \ arity, src_ctype, dst_ctype, KernImpl>; \ + dst_ctype* dst = dst_tensor.ptr(); \ Op op(src_params, dst, dst_param); \ return run_elemwise(param, stream, op); \ } while (0); @@ -178,7 +179,8 @@ struct ModeDispatcher; using src_ctype = _src_ctype; \ using dst_ctype = _dst_ctype; \ static void run( \ - const ElemwiseOpParamN<_arity>& param, _dst_ctype* dst, \ + const ElemwiseOpParamN<_arity>& param, \ + const TensorND& dst_tensor, \ const SmallVector>& src_params, \ const CudaDTypeParam<_dst_ctype>& dst_param, \ param::Elemwise::Mode mode, cudaStream_t stream) { \ @@ -211,6 +213,8 @@ IMPL_MODE_DISPATCHER(3, dt_qint8, dt_qint8); MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) IMPL_MODE_DISPATCHER(1, dt_qint8, dt_qint32); IMPL_MODE_DISPATCHER(1, dt_qint32, dt_qint8); +IMPL_MODE_DISPATCHER(1, dt_qint4, dt_qint32); +IMPL_MODE_DISPATCHER(1, dt_quint4, dt_qint32); #undef FOREACH #define FOREACH(cb) \ @@ -221,6 +225,88 @@ IMPL_MODE_DISPATCHER(1, dt_qint32, dt_qint8); MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) IMPL_MODE_DISPATCHER(2, dt_qint8, dt_qint32); IMPL_MODE_DISPATCHER(2, dt_qint32, dt_qint8); +IMPL_MODE_DISPATCHER(2, dt_qint4, dt_qint32); +IMPL_MODE_DISPATCHER(2, dt_quint4, dt_qint32); +#undef FOREACH + +#undef _cb_dispatch_mode + +#define _cb_dispatch_mode(_m) \ + case param::Elemwise::Mode::_m: \ + do { \ + using KernImpl = \ + ElemwiseKern; \ + using Op = kern_ops_quantized::QuantizedMultiTypeOp< \ + arity, src_ctype, dst_ctype, KernImpl>; \ + using dst_storage = typename VectTypeTrait::Storage; \ + dst_storage* dst = \ + reinterpret_cast(dst_tensor.raw_ptr); \ + Op op(src_params, dst, dst_param); \ + ElemwiseOpParamN<1> param_dst; \ + param_dst[0] = dst_tensor; \ + param_dst.init_from_given_tensor(); \ + run_elemwise(param, param_dst, \ + stream, op); \ + return; \ + } while (0); + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ABS, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(NEGATE, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(CEIL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FLOOR, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ROUND, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +IMPL_MODE_DISPATCHER(1, dt_qint4, dt_qint4); +IMPL_MODE_DISPATCHER(1, dt_quint4, dt_quint4); +#undef FOREACH + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MAX, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(MIN, cb) \ + 
MEGDNN_ELEMWISE_MODE_ENABLE(MUL, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SUB, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SWITCH_GT0, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LT, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(LEQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(EQ, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +IMPL_MODE_DISPATCHER(2, dt_qint4, dt_qint4); +IMPL_MODE_DISPATCHER(2, dt_quint4, dt_quint4); +#undef FOREACH + +#define FOREACH MEGDNN_FOREACH_ELEMWISE_MODE_TERNARY_FLOAT +IMPL_MODE_DISPATCHER(3, dt_qint4, dt_qint4); +IMPL_MODE_DISPATCHER(3, dt_quint4, dt_quint4); +#undef FOREACH + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FAST_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(H_SWISH, cb) +IMPL_MODE_DISPATCHER(1, dt_qint32, dt_qint4); +IMPL_MODE_DISPATCHER(1, dt_qint32, dt_quint4); +#undef FOREACH + +#define FOREACH(cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(ADD, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_RELU, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_SIGMOID, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_TANH, cb) \ + MEGDNN_ELEMWISE_MODE_ENABLE(FUSE_ADD_H_SWISH, cb) +IMPL_MODE_DISPATCHER(2, dt_qint32, dt_qint4); +IMPL_MODE_DISPATCHER(2, dt_qint32, dt_quint4); #undef FOREACH #undef _cb_dispatch_mode @@ -235,8 +321,7 @@ void dispatch_src_ctype(const ElemwiseOpParamN<1>&, const TensorND& dst_tensor, auto param_a = param[0].layout.dtype.param(); \ auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ ModeDispatcher<1, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ - param, dst_tensor.ptr::ctype>(), \ - {param_a}, dst_param, mode, stream); \ + param, dst_tensor, {param_a}, dst_param, mode, stream); \ break; \ } @@ -262,6 +347,38 @@ void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, typedef dt_qint32 ctype_src; switch (dst_tensor.layout.dtype.enumv()) { DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::Quantized4Asymm); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::QuantizedS32); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_quint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::Quantized4Asymm); + DISPATCH(dtype::QuantizedS32); default: megdnn_throw(ssprintf( "Unsupported output dtype %s for ElemwiseMultiType", @@ -277,8 +394,8 @@ void dispatch_src_ctype(const ElemwiseOpParamN<1>& param, auto param_b = param[1].layout.dtype.param(); \ auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ ModeDispatcher<2, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ - param, dst_tensor.ptr::ctype>(), \ - {param_a, param_b}, dst_param, mode, stream); \ + param, dst_tensor, {param_a, param_b}, dst_param, mode, \ + stream); \ break; \ } @@ -308,12 
+425,45 @@ void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, typedef dt_qint32 ctype_src; switch (dst_tensor.layout.dtype.enumv()) { DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::Quantized4Asymm); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::QuantizedS32); default: megdnn_throw(ssprintf( "Unsupported output dtype %s for ElemwiseMultiType", dst_tensor.layout.dtype.name())); } } + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_quint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::Quantized4Asymm); + DISPATCH(dtype::QuantizedS32); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + #undef DISPATCH #define DISPATCH(_dt) \ @@ -323,8 +473,8 @@ void dispatch_src_ctype(const ElemwiseOpParamN<2>& param, auto param_c = param[2].layout.dtype.param(); \ auto dst_param = dst_tensor.layout.dtype.param<_dt>(); \ ModeDispatcher<3, ctype_src, typename DTypeTrait<_dt>::ctype>::run( \ - param, dst_tensor.ptr::ctype>(), \ - {param_a, param_b, param_c}, dst_param, mode, stream); \ + param, dst_tensor, {param_a, param_b, param_c}, dst_param, \ + mode, stream); \ break; \ } @@ -346,6 +496,34 @@ void dispatch_src_ctype(const ElemwiseOpParamN<3>& param, } } +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<3>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_qint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::QuantizedS4); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + +template <> +void dispatch_src_ctype(const ElemwiseOpParamN<3>& param, + const TensorND& dst_tensor, + Elemwise::Mode mode, cudaStream_t stream) { + typedef dt_quint4 ctype_src; + switch (dst_tensor.layout.dtype.enumv()) { + DISPATCH(dtype::Quantized4Asymm); + default: + megdnn_throw(ssprintf( + "Unsupported output dtype %s for ElemwiseMultiType", + dst_tensor.layout.dtype.name())); + } +} + #undef DISPATCH } // namespace @@ -355,8 +533,10 @@ void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<1>& param, Elemwise::Mode mode) { megdnn_assert( param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8 || - param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32, - "expect inputs dtype to be qint8/qint32, but got: %s", + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32 || + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS4 || + param[0].layout.dtype.enumv() == DTypeEnum::Quantized4Asymm, + "expect inputs dtype to be qint8/qint32/q4, but got: %s", param[0].layout.dtype.name()); auto stream = cuda_stream(this->handle()); switch (param[0].layout.dtype.enumv()) { @@ -369,6 +549,8 @@ void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<1>& param, DISPATCH(dtype::QuantizedS8); DISPATCH(dtype::QuantizedS32); + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::Quantized4Asymm); default: megdnn_throw( @@ -386,8 +568,10 @@ void 
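// ---- editor's sketch, not part of the patch ---------------------------------
// Conceptual shape of the dispatch added above once the DISPATCH /
// IMPL_MODE_DISPATCHER macros are flattened: the caller fixes the source
// ctype, then a switch on the destination dtype enum picks the kernel
// instantiation. The q4 variants additionally wrap the destination tensor in
// its own ElemwiseOpParamN<1> (param_dst above), presumably because a packed
// 4-bit output cannot be addressed through a typed element pointer. All names
// below are simplified stand-ins, not MegDNN API.
#include <cstdio>

namespace sketch {
enum class DstDType { QuantizedS4, Quantized4Asymm, QuantizedS32 };
struct qint4 {}; struct quint4 {}; struct qint32 {};

template <typename SrcCType, typename DstCType>
void run_mode_dispatcher() {
    std::printf("launch the kernels for one (src ctype, dst ctype) pair\n");
}

template <typename SrcCType>
void dispatch_dst_dtype(DstDType dst) {
    switch (dst) {
        case DstDType::QuantizedS4:     run_mode_dispatcher<SrcCType, qint4>();  break;
        case DstDType::Quantized4Asymm: run_mode_dispatcher<SrcCType, quint4>(); break;
        case DstDType::QuantizedS32:    run_mode_dispatcher<SrcCType, qint32>(); break;
    }
}
}  // namespace sketch
// ------------------------------------------------------------------------------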
ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<2>& param, param[1].layout.dtype.enumv()); megdnn_assert( param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8 || - param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32, - "expect inputs dtype to be qint8/qint32, but got: %s", + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS32 || + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS4 || + param[0].layout.dtype.enumv() == DTypeEnum::Quantized4Asymm, + "expect inputs dtype to be qint8/qint32/q4, but got: %s", param[0].layout.dtype.name()); auto stream = cuda_stream(this->handle()); switch (param[0].layout.dtype.enumv()) { @@ -400,6 +584,8 @@ void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<2>& param, DISPATCH(dtype::QuantizedS8); DISPATCH(dtype::QuantizedS32); + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::Quantized4Asymm); default: megdnn_throw( @@ -419,8 +605,10 @@ void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<3>& param, param[2].layout.dtype.enumv()); megdnn_assert( - param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8, - "expect inputs dtype to be qint8, but got: %s", + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS8 || + param[0].layout.dtype.enumv() == DTypeEnum::QuantizedS4 || + param[0].layout.dtype.enumv() == DTypeEnum::Quantized4Asymm, + "expect inputs dtype to be qint8/q4, but got: %s", param[0].layout.dtype.name()); auto stream = cuda_stream(this->handle()); switch (param[0].layout.dtype.enumv()) { @@ -432,6 +620,8 @@ void ElemwiseMultiTypeImpl::on_quantized_mode(const ElemwiseOpParamN<3>& param, } DISPATCH(dtype::QuantizedS8); + DISPATCH(dtype::QuantizedS4); + DISPATCH(dtype::Quantized4Asymm); default: megdnn_throw( diff --git a/dnn/src/cuda/relayout/kern.cu b/dnn/src/cuda/relayout/kern.cu index a87ecf2239c209af12ace61e2378cc63c6882968..16dd3d540587e488fb52fb6be1863092070d0e89 100644 --- a/dnn/src/cuda/relayout/kern.cu +++ b/dnn/src/cuda/relayout/kern.cu @@ -22,23 +22,30 @@ void copy_noncontig_general(const TensorND &dst, const TensorND &src, cudaStream param[0] = dst; param[1] = src; -#define RUN(_dt) \ +#define RUN(_dt, body) \ do { \ typedef DTypeTrait::ctype ctype; \ - param[0].layout.dtype = param[1].layout.dtype = dtype::_dt(); \ + body; \ param.init_from_given_tensor(); \ param.assert_initialized(); \ noncontig_general_intl::UserOpInvoker(param, stream); \ return; \ } while (0) - switch (dst.layout.dtype.size()) { - case 1: - RUN(Byte); - case 2: - RUN(Float16); - case 4: - RUN(Int32); + if (!dst.layout.dtype.is_low_bit()) { + switch (dst.layout.dtype.size()) { + case 1: + RUN(Byte, param[0].layout.dtype = param[1].layout.dtype = + dtype::Byte()); + case 2: + RUN(Float16, param[0].layout.dtype = param[1].layout.dtype = + dtype::Float16()); + case 4: + RUN(Int32, param[0].layout.dtype = param[1].layout.dtype = + dtype::Int32()); + } + } else { + RUN(Quantized4Asymm, ); } megdnn_assert(0, "bad dtype size"); } diff --git a/dnn/src/cuda/relayout/kern.cuh b/dnn/src/cuda/relayout/kern.cuh index 2ca1d21182e46a6b161512da4823c066b3250803..3a119bd24c488d1159bdee9c45edbace6583dc97 100644 --- a/dnn/src/cuda/relayout/kern.cuh +++ b/dnn/src/cuda/relayout/kern.cuh @@ -267,6 +267,129 @@ public: #undef DEFINE_CONTIG_RECEIVER +#define ON(access_idx, par0, par1) \ + { \ + int32_t idx0 = par0.idx(access_idx * 2); \ + int32_t idx1 = par0.idx(access_idx * 2 + 1); \ + Storage x = (idx0 >= 0) ? par1.at(idx0) : (Storage)0; \ + Storage y = (idx1 >= 0) ? 
par1.at(idx1) : (Storage)0; \ + Storage dst = par0.make_vector(x, y).x; \ + Storage* ptr = par0.ptr(); \ + int32_t offset = par0.offset_from_access(access_idx * 2) >> 1; \ + ptr[offset] = dst; \ + } + +template +__global__ void cuda_kern_general_q4(OpCaller op_caller, uint32_t size) { + uint32_t access_idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + using Storage = uint8_t; + if (access_idx < size) { + ON(access_idx, op_caller.par0, op_caller.par1); + access_idx += delta; + if (access_idx < size) { + ON(access_idx, op_caller.par0, op_caller.par1); + access_idx += delta; + if (access_idx < size) { + ON(access_idx, op_caller.par0, op_caller.par1); + } + } + } +} + +#undef ON + +#define DEFINE_CONTIG_RECEIVER(_ndim, _cb_header, _cb_dispatch) \ + _cb_header(_ndim) { return _cb_dispatch(_ndim, CONTIG_OTHER); } + +template <> +class UserOpInvoker { + bool m_invoked; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + size_t m_rw_size; + + void dispatch0() { + switch (m_param[0].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch1_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + } + +#define cb_header(ndim) void dispatch1_##ndim() +#define cb_dispatch(ndim, contig_mask) \ + dispatch2>() + DEFINE_CONTIG_RECEIVER(1, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(2, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(3, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(4, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(5, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(6, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(7, cb_header, cb_dispatch) +#undef cb_header +#undef cb_dispatch + + template + void dispatch2() { + switch (m_param[1].layout.ndim) { +#define cb(ndim) \ + case ndim: \ + return dispatch3_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) +#undef cb + } + } + +#define cb_header(ndim) \ + template \ + void dispatch3_##ndim() +#define cb_dispatch(ndim, contig_mask) \ + do_run>() + DEFINE_CONTIG_RECEIVER(1, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(2, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(3, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(4, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(5, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(6, cb_header, cb_dispatch) + DEFINE_CONTIG_RECEIVER(7, cb_header, cb_dispatch) +#undef cb_header +#undef cb_dispatch + + int count = 0; + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinaryNoContiguous Caller; + size_t size = m_param[0].layout.access_bytes(); + int grid_size, block_size; + + Caller caller; + auto param_host_init = [&]() { + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + }; + //! 
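// ---- editor's sketch, not part of the patch ---------------------------------
// Packing convention the q4 relayout kernel above relies on: one byte stores
// two consecutive 4-bit elements, the even access index in the low nibble and
// the odd one in the high nibble; lanes that map to a negative index are
// written as zero. The helpers below are hypothetical illustrations, not
// MegDNN functions.
#include <cstdint>

inline uint8_t pack_two_q4(int8_t lo, int8_t hi) {
    // keep only 4 bits of each value before packing into one byte
    return static_cast<uint8_t>((lo & 0xf) | ((hi & 0xf) << 4));
}

inline int8_t unpack_q4_signed(uint8_t byte, int lane) {
    // lane 0 -> low nibble, lane 1 -> high nibble; sign-extend from 4 bits
    int8_t v = static_cast<int8_t>((byte >> (lane * 4)) & 0xf);
    return static_cast<int8_t>((v ^ 0x8) - 0x8);
}
// ------------------------------------------------------------------------------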
general + auto fptr = cuda_kern_general_q4; + elemwise_intl::get_launch_spec(reinterpret_cast(fptr), + size, &grid_size, &block_size); + param_host_init(); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } + +public: + UserOpInvoker(const ElemwiseOpParamN<2>& param, cudaStream_t stream) + : m_rw_size(param.size), m_param(param), m_stream(stream) { + m_invoked = false; + dispatch0(); + megdnn_assert(m_invoked); + } +}; + /* f}}} */ #endif diff --git a/dnn/src/cuda/relayout/opr_impl.cpp b/dnn/src/cuda/relayout/opr_impl.cpp index 1e1adfdc6f74a0db88bcf482ab8678b933c49730..64da13e7d4d305eb6fa46b68910755172b2e9125 100644 --- a/dnn/src/cuda/relayout/opr_impl.cpp +++ b/dnn/src/cuda/relayout/opr_impl.cpp @@ -29,6 +29,8 @@ RelayoutForwardImpl::Param::Param(const TensorND &src, const TensorND &dst, } bool RelayoutForwardImpl::Param::try_transpose() { + if (m_dst.layout.dtype.is_low_bit()) + return false; relayout::TransposeParam transp; bool trans = relayout::is_transpose(m_src.layout, m_dst.layout, transp); if (!trans) @@ -81,10 +83,15 @@ bool RelayoutForwardImpl::Param::try_copy_contig() { return false; if (lsrc.stride[0] != 1 || ldst.stride[0] != 1) return false; - cuda_check(cudaMemcpyAsync( - m_dst.raw_ptr, m_src.raw_ptr, - ldst.total_nr_elems() * dtype_size(), - cudaMemcpyDeviceToDevice, m_opr->stream())); + size_t copy_size; + if (ldst.dtype.is_low_bit()) { + copy_size = ldst.access_bytes(); + } else { + copy_size = ldst.total_nr_elems() * dtype_size(); + } + + cuda_check(cudaMemcpyAsync(m_dst.raw_ptr, m_src.raw_ptr, copy_size, + cudaMemcpyDeviceToDevice, m_opr->stream())); return true; } @@ -107,6 +114,8 @@ bool RelayoutForwardImpl::Param::try_copy_2d(bool cross_dev) { if (lsrc.ndim > 2 || ldst.ndim > 2) return false; + if (ldst.dtype.is_low_bit()) + return false; if (ldst.ndim == 1 && lsrc.ndim == 1) { megdnn_assert(ldst.stride[0] != 1 || lsrc.stride[0] != 1); @@ -147,6 +156,8 @@ bool RelayoutForwardImpl::Param::try_copy_2d(bool cross_dev) { }; bool RelayoutForwardImpl::Param::try_copy_last_contig() { + if (m_dst.layout.dtype.is_low_bit()) + return false; //! 
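// ---- editor's sketch, not part of the patch ---------------------------------
// For the contiguous low-bit copy path above, a fully contiguous 4-bit tensor
// stores two elements per byte, so the cudaMemcpyAsync size is the element
// count rounded up to an even number and halved. Hypothetical helper:
#include <cstddef>

inline size_t q4_contig_copy_bytes(size_t nr_elems) {
    // e.g. a {1, 64, 15, 15} QuantizedS4 tensor: 14400 elements -> 7200 bytes
    return (nr_elems + 1) / 2;
}
// ------------------------------------------------------------------------------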
check if the last stride is contiguous auto gcd = [](size_t a, size_t b) { if (a > b) std::swap(a, b); diff --git a/dnn/src/cuda/relayout/param_visitor.cpp b/dnn/src/cuda/relayout/param_visitor.cpp index 288bff8bdda7a96ddb4b60dd175cb2acba94c492..73fab4c3f8f88190efbc47fd618b2524439ee85c 100644 --- a/dnn/src/cuda/relayout/param_visitor.cpp +++ b/dnn/src/cuda/relayout/param_visitor.cpp @@ -21,8 +21,7 @@ namespace cuda { #pragma GCC diagnostic ignored "-Warray-bounds" template void ParamElemVisitor::host_init( - - const TensorND &rv, int /*grid_size*/, int /*block_size*/) { + const TensorND& rv, int /*grid_size*/, int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); m_ptr = rv.ptr(); for (size_t i = 0; i < rv.layout.ndim; ++i) { @@ -66,6 +65,43 @@ INST_FOR_CTYPE #undef INST_FOR_CTYPE #undef INST +template +void ParamElemVisitor::host_init( + const TensorND& rv, int /*grid_size*/, int /*block_size*/) { + megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); + m_ptr = reinterpret_cast(rv.raw_ptr); + for (size_t i = 0; i < rv.layout.ndim; ++i) { + m_stride[i] = rv.layout.stride[i]; + m_shape[i] = rv.layout.shape[i]; + if (i + 1 < rv.layout.ndim) { + m_shape_highdim[i] = rv.layout.shape[i + 1]; + if (rv.layout.stride[i + 1] == 1) + m_align_shape_highdim[i] = + (uint32_t)round_up((int)rv.layout.shape[i + 1], 2); + else + m_align_shape_highdim[i] = rv.layout.shape[i + 1]; + } + } + for (size_t i = rv.layout.ndim - 1; i < ndim - 1; ++i) { + m_shape_highdim[i] = 1; + m_align_shape_highdim[i] = 1; + } + for (size_t i = rv.layout.ndim; i < ndim; ++i) { + m_stride[i] = 0; + m_shape[i] = 1; + } + m_is_physical_contiguous = rv.layout.is_physical_contiguous(); + m_is_contiguous = rv.layout.is_contiguous(); +} + +#define INST(ndim, ctg) template class ParamElemVisitor +#define ndim_cb(_ndim) INST(_ndim, CONTIG_OTHER); + +MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) + +#undef ndim_cb +#undef INST + } // namespace cuda } // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen diff --git a/dnn/src/cuda/relayout/param_visitor.cuh b/dnn/src/cuda/relayout/param_visitor.cuh index 6db08acf6acaf2fa669c47238f9da73dc5401872..a44fb1b4d90b1eacf541c8e356a436c394bf900f 100644 --- a/dnn/src/cuda/relayout/param_visitor.cuh +++ b/dnn/src/cuda/relayout/param_visitor.cuh @@ -122,6 +122,119 @@ public: #undef PARAM_ELEM_VISITOR_COMMON_DEV +template +class ParamElemVisitor { + using Storage = uint8_t; + +protected: + Storage* __restrict m_ptr; + int m_stride[ndim]; + int m_shape[ndim]; + bool m_is_contiguous; + bool m_is_physical_contiguous; + + //! m_shape_highdim[i] = original_shape[i + 1] +#ifdef _MSC_VER + Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1]; + Uint32Fastdiv m_align_shape_highdim[ndim > 1 ? 
ndim - 1 : 1]; +#else + Uint32Fastdiv m_shape_highdim[ndim]; + Uint32Fastdiv m_align_shape_highdim[ndim]; +#endif + +public: + static const Storage kMask = 0xf; + static const Storage kBits = 4; + static const int NDIM = ndim; + void host_init(const TensorND& rv, int grid_size, int block_size); + +#if MEGDNN_CC_CUDA + devfunc void thread_init(uint32_t) {} + + devfunc void next() {} + + devfunc void get_shape_from_access(uint32_t access_idx, + int (&shape_idx)[ndim]) { +#pragma unroll + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& align_shp = m_align_shape_highdim[i - 1]; + uint32_t access_idx_div = access_idx / align_shp; + shape_idx[i] = access_idx - access_idx_div * align_shp.divisor(); + access_idx = access_idx_div; + } + shape_idx[0] = access_idx; + } + + devfunc int offset(uint32_t idx) { + int offset = 0; +#pragma unroll + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; + uint32_t idx_div = idx / shp; + offset += (idx - idx_div * shp.divisor()) * m_stride[i]; + idx = idx_div; + } + offset += idx * m_stride[0]; + return offset; + } + + devfunc int offset_from_access(uint32_t access_idx) { + int offset = 0; + if (m_is_contiguous) { + offset = access_idx; + } else { + int shape_idx[ndim]; + get_shape_from_access(access_idx, shape_idx); + #pragma unroll + for (int i = ndim - 1; i >= 0; --i) { + offset += shape_idx[i] * m_stride[i]; + } + } + return offset; + } + + devfunc int idx(uint32_t access_idx) { + int idx = 0; + if (m_is_physical_contiguous) { + idx = access_idx; + } else { + int shape_idx[ndim]; + bool valid = true; + get_shape_from_access(access_idx, shape_idx); +#pragma unroll + for (int i = 0; i < ndim; ++i) { + valid &= (shape_idx[i] < m_shape[i]); + } +#pragma unroll + for (int i = 0; i < ndim - 1; ++i) { + idx = (idx + shape_idx[i]) * m_shape[i + 1]; + } + idx = valid ? 
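// ---- editor's sketch, not part of the patch ---------------------------------
// Idea behind get_shape_from_access()/idx() in this visitor: the kernel walks
// an aligned shape whose innermost contiguous extent is rounded up to an even
// count so every byte holds two lanes; access positions that land in the
// padding map to -1 and the caller stores a zero nibble for them. A 2-D
// stand-in of that mapping:
#include <cstdint>

inline int32_t logical_index_2d(uint32_t access_idx, uint32_t rows, uint32_t cols) {
    uint32_t aligned_cols = (cols + 1u) & ~1u;   // innermost dim rounded up to even
    uint32_t r = access_idx / aligned_cols;
    uint32_t c = access_idx - r * aligned_cols;
    if (r >= rows || c >= cols)
        return -1;                               // padding lane
    return static_cast<int32_t>(r * cols + c);   // row-major logical index
}
// ------------------------------------------------------------------------------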
idx + shape_idx[ndim - 1] : -1; + } + return idx; + } + + devfunc Storage* ptr() { return m_ptr; } + + devfunc Storage at(uint32_t idx) { + int offset_ = offset(idx); + int vec_idx = offset_ >> 1; + int lane_idx = offset_ & 0x1; + + Storage item = Storage(unpack_integer_4bits( + *(Storage*)&m_ptr[vec_idx], lane_idx * 4)); + + return item; + } + + using rwtype = typename elemwise_intl::VectTypeTrait::vect_type; + + devfunc rwtype make_vector(Storage x, Storage y) { + return elemwise_intl::VectTypeTrait::make_vector(x, y); + } +#endif +}; + } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/type_cvt/kern.cu b/dnn/src/cuda/type_cvt/kern.cu index 675aad99fc4cbed723cb2b5c58c8b0066cda4cb9..349770aec9789da44540665f9d5657aa00065027 100644 --- a/dnn/src/cuda/type_cvt/kern.cu +++ b/dnn/src/cuda/type_cvt/kern.cu @@ -12,6 +12,7 @@ #include "./kern.cuh" #include "megdnn/dtype.h" #include "src/cuda/elemwise_helper.cuh" +#include "src/cuda/elemwise_helper_q4.cuh" using namespace megdnn; using namespace cuda; @@ -137,9 +138,9 @@ struct TypeCvtOpFromQuantized< template struct TypeCvtOpBetweenQuantized< ctype_dest, ctype_src, - typename std::enable_if< - std::is_same::value || - std::is_same::value>::type> { + typename std::enable_if<(std::is_same::value || + std::is_same::value) && + IsNotTypeQ4::value>::type> { ctype_dest* dest; CudaDTypeParam src_param; CudaDTypeParam dst_param; @@ -162,6 +163,76 @@ struct TypeCvtOpBetweenQuantized< VectTypeTrait::make_vector(x, y, z, w); } }; + +template +struct TypeCvtOpFromNormalToQuantized4bit { + CudaDTypeParam dst_param; + using dst_vect_type = typename VectTypeTrait::vect_type; + using dst_storage = typename VectTypeTrait::Storage; + dst_storage* dest; + __device__ __forceinline__ dst_storage apply(ctype_src in) { + return dst_param.quantize(in).as_storage(); + } + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src src_x, + ctype_src src_y) { + dst_storage x = apply(src_x); + dst_storage y = apply(src_y); + + *(dst_vect_type*)(&dest[idx]) = + VectTypeTrait::make_vector(x, y); + } +}; + +template +struct TypeCvtOpFromQuantizedToQuantized4bit { + CudaDTypeParam src_param; + CudaDTypeParam dst_param; + using dst_vect_type = typename VectTypeTrait::vect_type; + using dst_storage = typename VectTypeTrait::Storage; + dst_storage* dest; + __device__ __forceinline__ dst_storage apply(ctype_src in) { + float inter = src_param.dequantize(in); + return dst_param.quantize(inter).as_storage(); + } + __device__ __forceinline__ void operator()(uint32_t idx, ctype_src src_x, + ctype_src src_y) { + dst_storage x = apply(src_x); + dst_storage y = apply(src_y); + + *(dst_vect_type*)(&dest[idx]) = + VectTypeTrait::make_vector(x, y); + } +}; + +template +struct TypeCvtOpFromQuantizedToQuantized4bit< + ctype_dest, ctype_src, + typename std::enable_if::value>::type> { + static constexpr bool src_signedness = + std::is_same::value; + CudaDTypeParam src_param; + CudaDTypeParam dst_param; + using src_vect_type = typename VectTypeTrait::vect_type; + using dst_vect_type = typename VectTypeTrait::vect_type; + using src_storage = typename VectTypeTrait::Storage; + using dst_storage = typename VectTypeTrait::Storage; + dst_storage* dest; + __device__ __forceinline__ dst_storage apply(src_storage in) { + float inter = src_param.dequantize(in); + return dst_param.quantize(inter).as_storage(); + } + __device__ __forceinline__ void operator()(uint32_t idx, + src_vect_type src) { + dst_storage x = apply( + src_storage(unpack_integer_4bits(src.x, 0))); + dst_storage y 
= apply( + src_storage(unpack_integer_4bits(src.x, 4))); + + *(dst_vect_type*)(&dest[idx]) = + VectTypeTrait::make_vector(x, y); + } +}; + } // anonymous namespace #define main_func(OpType, body) \ @@ -237,6 +308,7 @@ void typecvt_kern_n2n(const TensorND& dest, const TensorND& src, template void typecvt_kern_n2n( \ const TensorND& dest, const TensorND& src, cudaStream_t stream); +// clang-format off #define MEGDNN_FOREACH_COMPUTING_DTYPE_WITH_DTYPE_SRC(dtype_src, cb) \ cb(dtype_src, dt_int8) \ cb(dtype_src, dt_int32) \ @@ -250,7 +322,7 @@ void typecvt_kern_n2n(const TensorND& dest, const TensorND& src, #define MEGDNN_FOREACH_QUANTIZED_DTYPE_WITH_DTYPE_SRC(dtype_src, cb) \ cb(dtype_src, dt_quint8) \ cb(dtype_src, dt_qint32) \ - cb(dtype_src, dt_qint8) \ + cb(dtype_src, dt_qint8) \ #define INST_SRC_QUANTIZED(dtype_src) \ MEGDNN_FOREACH_COMPUTING_DTYPE_WITH_DTYPE_SRC(dtype_src, INST_Q2N) \ @@ -273,15 +345,85 @@ void typecvt_kern_n2n(const TensorND& dest, const TensorND& src, #define MEGDNN_FOREACH_QUANTIZED_CTYPE(cb) \ cb(dt_quint8) \ cb(dt_qint32) \ - cb(dt_qint8) + cb(dt_qint8) \ + cb(dt_qint4) \ + cb(dt_quint4) MEGDNN_FOREACH_QUANTIZED_CTYPE(INST_SRC_QUANTIZED) MEGDNN_FOREACH_COMPUTING_CTYPE(INST_SRC_NORMAL) +// clang-format on template void typecvt_kern_n2q( const TensorND& src, const TensorND& dst, const CudaDTypeParam& param, cudaStream_t stream); +#define main_func_to_q4(OpType, body) \ + { \ + typedef typename DTypeTrait::ctype ctype_src; \ + typedef typename DTypeTrait::ctype ctype_dest; \ + typedef OpType Op; \ + ElemwiseOpParamN<1> param_src; \ + ElemwiseOpParamN<1> param_dst; \ + param_src[0] = src; \ + param_dst[0] = dest; \ + param_src.init_from_given_tensor(); \ + param_dst.init_from_given_tensor(); \ + megdnn_assert(DTypeTrait::enumv == \ + src.layout.dtype.enumv().ev); \ + megdnn_assert(DTypeTrait::enumv == \ + dest.layout.dtype.enumv().ev); \ + using dst_storage = typename VectTypeTrait::Storage; \ + Op op; \ + op.dest = reinterpret_cast(dest.raw_ptr); \ + body; \ + run_elemwise(param_src, param_dst, \ + stream, op); \ + return; \ + } + +template +void typecvt_kern_q2q4(const TensorND& dest, const TensorND& src, + const CudaDTypeParam& src_param, + const CudaDTypeParam& dst_param, + cudaStream_t stream) { + main_func_to_q4(TypeCvtOpFromQuantizedToQuantized4bit, + op.dst_param = dst_param; + op.src_param = src_param;) +} + +template +void typecvt_kern_n2q4(const TensorND& dest, const TensorND& src, + const CudaDTypeParam& dst_param, + cudaStream_t stream) { + main_func_to_q4(TypeCvtOpFromNormalToQuantized4bit, + op.dst_param = dst_param;) +} + +#define INST_Q2Q4(dtype_src, dtype_dest) \ + template void typecvt_kern_q2q4( \ + const TensorND& dest, const TensorND& src, \ + const CudaDTypeParam& src_param, \ + const CudaDTypeParam& dst_param, cudaStream_t stream); + +#define INST_N2Q4(dtype_src, dtype_dest) \ + template void typecvt_kern_n2q4( \ + const TensorND& dest, const TensorND& src, \ + const CudaDTypeParam& dst_param, cudaStream_t stream); + +// clang-format off +#define MEGDNN_FOREACH_QUANTIZED_LOWBIT_WITH_DTYPE_SRC(dtype_src, cb) \ + cb(dtype_src, dt_qint4) \ + cb(dtype_src, dt_quint4) \ + +#define INST_SRC_QUANTIZED_LOWBIT(dtype_src) \ + MEGDNN_FOREACH_QUANTIZED_LOWBIT_WITH_DTYPE_SRC(dtype_src, INST_Q2Q4) \ + +#define INST_SRC_NORMAL_LOWBIT(dtype_src) \ + MEGDNN_FOREACH_QUANTIZED_LOWBIT_WITH_DTYPE_SRC(dtype_src, INST_N2Q4) \ + +MEGDNN_FOREACH_QUANTIZED_CTYPE(INST_SRC_QUANTIZED_LOWBIT) +MEGDNN_FOREACH_COMPUTING_CTYPE(INST_SRC_NORMAL_LOWBIT) + } // namespace cuda } // 
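// ---- editor's sketch, not part of the patch ---------------------------------
// Arithmetic performed by TypeCvtOpFromQuantizedToQuantized4bit above, written
// out for a single scalar: dequantize with the source parameters, requantize
// with the destination scale, and the two 4-bit results are then packed into
// one output byte. The clamping bounds for signed 4-bit are an assumption here.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t qs8_to_qs4_scalar(int8_t q, float src_scale, float dst_scale) {
    float real = q * src_scale;                     // dequantize
    float requant = std::round(real / dst_scale);   // requantize
    return static_cast<int8_t>(std::max(-8.f, std::min(7.f, requant)));
}
// ------------------------------------------------------------------------------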
namespace megdnn diff --git a/dnn/src/cuda/type_cvt/kern.cuh b/dnn/src/cuda/type_cvt/kern.cuh index a5c98a4a55daf3fdc313ae38e4a43eed007b76e2..517e0bd208f06d817ac5fd47fcd18fd222ea03ee 100644 --- a/dnn/src/cuda/type_cvt/kern.cuh +++ b/dnn/src/cuda/type_cvt/kern.cuh @@ -38,6 +38,17 @@ void typecvt_kern_q2q( const CudaDTypeParam& dst_param, cudaStream_t stream); +template +void typecvt_kern_n2q4(const TensorND& dest, const TensorND& src, + const CudaDTypeParam& param, + cudaStream_t stream); + +template +void typecvt_kern_q2q4(const TensorND& dest, const TensorND& src, + const CudaDTypeParam& src_param, + const CudaDTypeParam& dst_param, + cudaStream_t stream); + } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/type_cvt/opr_impl.cpp b/dnn/src/cuda/type_cvt/opr_impl.cpp index e4332ee7eaaf7c5c077fb20c57a1996ea02d79b4..b514a1772726fdcc804ac6d583814085ac9c0e01 100644 --- a/dnn/src/cuda/type_cvt/opr_impl.cpp +++ b/dnn/src/cuda/type_cvt/opr_impl.cpp @@ -26,8 +26,9 @@ void exec_src_quantized( cudaStream_t stream) { bool is_dst_quantized = dst.layout.dtype.category() == DTypeCategory::QUANTIZED; + bool is_dst_lowbit = dst.layout.dtype.is_low_bit(); using ctype_src = typename DTypeTrait::ctype; - if (!is_dst_quantized) { + if (!is_dst_quantized && !is_dst_lowbit) { switch (dst.layout.dtype.enumv()) { #define cb(_dt) \ case DTypeTrait<_dt>::enumv: { \ @@ -40,7 +41,7 @@ void exec_src_quantized( megdnn_assert_internal(0); #undef cb } - } else { + } else if (!is_dst_lowbit) { switch (dst.layout.dtype.enumv()) { #define cb(_dt) \ case DTypeTrait<_dt>::enumv: { \ @@ -53,6 +54,21 @@ void exec_src_quantized( MEGDNN_FOREACH_QUANTIZED_DTYPE(cb); default: megdnn_assert_internal(0); +#undef cb + } + } else { + switch (dst.layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + auto dst_param = dst.layout.dtype.param<_dt>(); \ + using ctype_dest = typename DTypeTrait<_dt>::ctype; \ + typecvt_kern_q2q4(dst, src, src_param, \ + dst_param, stream); \ + return; \ + } + MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb); + default: + megdnn_assert_internal(0); #undef cb } } @@ -63,8 +79,9 @@ void exec_src_normal(const TensorND& dst, const TensorND& src, cudaStream_t stream) { bool is_dst_quantized = dst.layout.dtype.category() == DTypeCategory::QUANTIZED; + bool is_dst_lowbit = dst.layout.dtype.is_low_bit(); using ctype_src = typename DTypeTrait::ctype; - if (!is_dst_quantized) { + if (!is_dst_quantized && !is_dst_lowbit) { switch (dst.layout.dtype.enumv()) { #define cb(_dt) \ case DTypeTrait<_dt>::enumv: { \ @@ -78,7 +95,7 @@ void exec_src_normal(const TensorND& dst, const TensorND& src, default: megdnn_assert_internal(0); } - } else { + } else if (!is_dst_lowbit) { switch (dst.layout.dtype.enumv()) { #define cb(_dt) \ case DTypeTrait<_dt>::enumv: { \ @@ -88,9 +105,23 @@ void exec_src_normal(const TensorND& dst, const TensorND& src, return; \ } MEGDNN_FOREACH_QUANTIZED_DTYPE(cb); +#undef cb default: megdnn_assert_internal(0); + } + } else { + switch (dst.layout.dtype.enumv()) { +#define cb(_dt) \ + case DTypeTrait<_dt>::enumv: { \ + auto dst_param = dst.layout.dtype.param<_dt>(); \ + using ctype_dest = typename DTypeTrait<_dt>::ctype; \ + typecvt_kern_n2q4(dst, src, dst_param, stream); \ + return; \ + } + MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb); #undef cb + default: + megdnn_assert_internal(0); } } } @@ -101,6 +132,7 @@ void TypeCvtImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { bool is_src_quantized = src.layout.dtype.category() == DTypeCategory::QUANTIZED; auto stream = 
cuda_stream(handle()); + if (!is_src_quantized) switch (src.layout.dtype.enumv()) { #define cb(_dt) \ @@ -123,6 +155,7 @@ void TypeCvtImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) { return; \ } MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) + MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb) #undef cb default: megdnn_assert_internal(0); diff --git a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp b/dnn/src/naive/elemwise_multi_type/opr_impl.cpp index e94c0376b5c0de42d6c7d851b9d3e9508d97122a..5c1513af8a2bf787b7876b51246ef8f603b1d59a 100644 --- a/dnn/src/naive/elemwise_multi_type/opr_impl.cpp +++ b/dnn/src/naive/elemwise_multi_type/opr_impl.cpp @@ -187,7 +187,8 @@ void ElemwiseMultiTypeImpl::dispatch_add_qint_op( auto iA = iter_a; auto pD = dst; for (size_t i = 0; i < size; i++) { - *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(*iA))); + src_ctype a = *iA; + *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(a))); ++iA; ++pD; } @@ -215,8 +216,10 @@ void ElemwiseMultiTypeImpl::dispatch_add_qint_op( auto iB = iter_b; auto pD = dst; for (size_t i = 0; i < size; i++) { - *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(*iA), - param1.dequantize(*iB))); + src_ctype a = *iA; + src_ctype b = *iB; + *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(a), + param1.dequantize(b))); ++iA; ++iB; ++pD; @@ -250,9 +253,12 @@ void ElemwiseMultiTypeImpl::dispatch_add_qint_op( auto iC = iter_c; auto pD = dst; for (size_t i = 0; i < size; i++) { - *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(*iA), - param1.dequantize(*iB), - param2.dequantize(*iC))); + src_ctype a = *iA; + src_ctype b = *iB; + src_ctype c = *iC; + *pD = dst_param.quantize(KernImpl::apply(param0.dequantize(a), + param1.dequantize(b), + param2.dequantize(c))); ++iA; ++iB; ++iC; @@ -292,6 +298,7 @@ void ElemwiseMultiTypeImpl::dispatch_qint_op_dtype(const ElemParam& param, ElemParam>(param, dst); \ break; MEGDNN_FOREACH_QUANTIZED_DTYPE(cb) + MEGDNN_FOREACH_QUANTIZED_LOWBIT_DTYPE(cb) #undef cb default: diff --git a/dnn/test/cuda/elemwise_multi_type.cpp b/dnn/test/cuda/elemwise_multi_type.cpp index 0bc0fd33b06c9b1d9ea96415c87a6428b5f142b7..b57037b56fff29719715d7d65cb829c3c063eee5 100644 --- a/dnn/test/cuda/elemwise_multi_type.cpp +++ b/dnn/test/cuda/elemwise_multi_type.cpp @@ -38,7 +38,7 @@ static void run_test(int arity, Checker& checker, Mode mode) for (auto type : std::vector>{ {dtype::QuantizedS8(1.4f), dtype::QuantizedS8(1.7f)}, {dtype::QuantizedS8(1.4f), dtype::QuantizedS32(0.1f)}, - {dtype::QuantizedS32(0.1f), dtype::QuantizedS8(0.4f)} + {dtype::QuantizedS32(0.1f), dtype::QuantizedS8(0.4f)}, }) { if (type.first.enumv() == DTypeEnum::QuantizedS32 || type.second.enumv() == DTypeEnum::QuantizedS32) { @@ -102,6 +102,64 @@ static void run_test(int arity, Checker& checker, Mode mode) } } +static void run_test_q4(int arity, Checker& checker, + Mode mode) { + for (auto type : std::vector>{ + {dtype::QuantizedS4(1.4f), dtype::QuantizedS4(1.7f)}, + {dtype::Quantized4Asymm(8, 1.4f), + dtype::Quantized4Asymm(8, 1.7f)}, + {dtype::QuantizedS4(1.4f), dtype::QuantizedS32(0.1f)}, + {dtype::QuantizedS32(0.1f), dtype::QuantizedS4(0.4f)}}) { + if (type.first.enumv() == DTypeEnum::QuantizedS32 || + type.second.enumv() == DTypeEnum::QuantizedS32) { + if (mode != Mode::QRELU && mode != Mode::QH_SWISH && + mode != Mode::QSIGMOID && mode != Mode::QTANH && + mode != Mode::QFAST_TANH && mode != Mode::QADD && + mode != Mode::QFUSE_ADD_RELU && + mode != Mode::QFUSE_ADD_SIGMOID && + mode != Mode::QFUSE_ADD_TANH && + mode != 
Mode::QFUSE_ADD_H_SWISH) { + return; + } + } + checker.set_param(mode); + UniformIntRNG rng_int4{-7, 7}; + UniformIntRNG rng_uint4{0, 15}; + UniformIntRNG rng_int32{INT16_MIN >> 1, INT16_MAX >> 1}; + + auto set_rng = [&](DType dtype, size_t i) { + if (dtype.enumv() == DTypeEnum::QuantizedS4) { + checker.set_rng(i, &rng_int4); + } else if (dtype.enumv() == DTypeEnum::Quantized4Asymm) { + checker.set_rng(i, &rng_uint4); + } else { + megdnn_assert(dtype.enumv() == DTypeEnum::QuantizedS32); + checker.set_rng(i, &rng_int32); + } + checker.set_dtype(i, dtype); + }; + //! As some mode may cause compute error + checker.set_epsilon(1 + 1e-3); + + auto src_type = type.first; + auto dst_type = type.second; + for (int i = 0; i < arity; i++) { + set_rng(src_type, i); + } + set_rng(dst_type, arity); + + if (arity == 1) { + checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}}) + .execs({{1, 4, 5, 5}, {1, 4, 5, 5}}); + } else if (arity == 2) { + checker.execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}}) + .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}}); + } else { + megdnn_assert(0); + } + } +} + TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_UNARY) { Checker checker(handle_cuda()); for (auto mode : @@ -113,6 +171,11 @@ TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_UNARY) { Mode::QERFCINV, Mode::QH_SWISH}) { run_test(1, checker, mode); } + for (auto mode : {Mode::QRELU, Mode::QABS, Mode::QCEIL, Mode::QFLOOR, + Mode::QNEGATE, Mode::QSIGMOID, Mode::QTANH, + Mode::QFAST_TANH, Mode::QROUND, Mode::QH_SWISH}) { + run_test_q4(1, checker, mode); + } } TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_BINARY) { @@ -145,6 +208,15 @@ TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_BINARY) { Mode::QFUSE_ADD_H_SWISH}) { run_test(2, checker, mode); } + for (auto mode : {Mode::QADD, Mode::QMAX, Mode::QMIN, Mode::QMUL, + Mode::QSUB, Mode::QSWITCH_GT0, + + Mode::QLT, Mode::QLEQ, Mode::QEQ, + + Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_SIGMOID, + Mode::QFUSE_ADD_TANH, Mode::QFUSE_ADD_H_SWISH}) { + run_test_q4(2, checker, mode); + } } TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_TENARY) { @@ -152,7 +224,6 @@ TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_TENARY) { Checker checker(handle_cuda()); for (auto mode : {Mode::QFUSE_MUL_ADD3, Mode::QCOND_LEQ_MOV}) { - printf("Testing mode: %d\n", (int)mode); UniformIntRNG rng_int8{-127, 127}; UniformIntRNG rng_uint8{0, 225}; checker.set_param({mode}) @@ -169,7 +240,17 @@ TEST_F(CUDA, ELEMWISE_QUANTIZED_MODE_TENARY) { .execs({{9}, {9}, {9}, {}}) .execs({{17}, {17}, {17}, {}}) .execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}, {}}); - + UniformIntRNG rng_int4{-7, 7}; + checker.set_param({mode}) + .set_rng(0, &rng_int4) + .set_rng(1, &rng_int4) + .set_rng(2, &rng_int4) + .set_dtype(0, dtype::QuantizedS4(1.2f)) + .set_dtype(1, dtype::QuantizedS4(1.6f)) + .set_dtype(2, dtype::QuantizedS4(1.8f)) + .set_dtype(3, dtype::QuantizedS4(1.4f)) + .execs({{3, 4, 5, 6}, {3, 4, 5, 6}, {3, 4, 5, 6}, {}}) + .execs({{1, 4, 5, 5}, {1, 4, 5, 5}, {1, 4, 5, 5}, {}}); } } @@ -373,6 +454,44 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_QUANTIZED_MODE_TENARY) { } } +TEST_F(CUDA, BENCHMARK_ELEMWISE_QUANTIZED_MODE_BINARY_Q4) { + using Mode = ElemwiseMultiType::Param::Mode; + CUBenchmarker bencher(handle_cuda()); + UniformIntRNG rng{-7, 7}; + + for (auto mode : {Mode::QADD, Mode::QFUSE_ADD_RELU, Mode::QFUSE_ADD_SIGMOID, + Mode::QFUSE_ADD_TANH, Mode::QFUSE_ADD_H_SWISH}) { + printf("Benchmark mode: %d\n", (int)mode); + bencher.set_param({mode}) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_dtype(0, dtype::QuantizedS4(0.1f)) + .set_dtype(1, dtype::QuantizedS4(0.2f)) + .set_dtype(2, 
dtype::QuantizedS4(0.01f)); + size_t nr_times = 50; + bencher.set_times(nr_times); + auto run_bench = [&](size_t N, size_t C, size_t H, size_t W) { + printf("(NxCxHxW)=(%zux%zux%zux%zu)\n", N, C, H, W); + auto time = + bencher.execs({{N, C, H, W}, {N, C, H, W}, {N, C, H, W}}) / + nr_times; + printf("time = %.2f, bandwidth = %.2f GB/s\n", time, + (3.0 * N * C * H * W) / 2 / (time * 1e6)); + + time = bencher.execs({{N, C / 64, H, W, 64}, + {N, C / 64, H, W, 64}, + {N, C / 64, H, W, 64}}) / + nr_times; + printf("time = %.2f, bandwidth = %.2f GB/s\n", time, + (3.0 * N * C * H * W) / 2 / (time * 1e6)); + }; + run_bench(256, 256, 56, 56); + run_bench(64, 256, 56, 56); + run_bench(256, 128, 28, 28); + run_bench(64, 128, 28, 28); + } +} + #endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/dnn/test/cuda/relayout.cpp b/dnn/test/cuda/relayout.cpp index e81e12170b6d6175f2cca6d771998092d135c6d8..5ad2bacf9a9f1af086f12a1e313ceec23c2770a5 100644 --- a/dnn/test/cuda/relayout.cpp +++ b/dnn/test/cuda/relayout.cpp @@ -917,4 +917,48 @@ TEST_F(CUDA, RELAYOUT_TEST) { checker.exec(TensorLayoutArray{arg.src, arg.dst}); } } + +TEST_F(CUDA, RELAYOUT_Q4) { + Checker checker(handle_cuda()); + UniformIntRNG rng_int4{-7, 7}; + checker.set_rng(0, &rng_int4) + .set_rng(1, &rng_int4) + .set_dtype(0, dtype::QuantizedS4(1.f)) + .set_dtype(1, dtype::QuantizedS4(1.f)) + .execs({{1, 64, 15, 15}, {1, 15, 15, 64}}) + .execs({{1, 5, 9, 32}, {1, 5, 32, 9}}) + .execl(TensorLayoutArray{ + {{6400}, {1}, dtype::QuantizedS4{1.f}}, + {{20, 320}, {1024, 1}, dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{ + {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}, + {{20, 60, 3}, {256, 4, 1}, dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{ + {{20, 20, 3, 3}, {256, 12, 4, 1}, dtype::QuantizedS4{1.f}}, + {{1200, 3}, {4, 1}, dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{{{5, 16, 7, 7, 4}, + {3136, 196, 28, 4, 1}, + dtype::QuantizedS4{1.f}}, + {{5, 16, 7, 7, 4}, + {3136, 4, 448, 64, 1}, + dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{{{5, 7, 7, 16, 4}, + {3136, 448, 64, 4, 1}, + dtype::QuantizedS4{1.f}}, + {{5, 7, 7, 16, 4}, + {3136, 28, 4, 196, 1}, + dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{{{5, 2, 7, 7, 32}, + {3136, 1568, 224, 32, 1}, + dtype::QuantizedS4{1.f}}, + {{5, 2, 7, 7, 32}, + {3136, 32, 448, 64, 1}, + dtype::QuantizedS4{1.f}}}) + .execl(TensorLayoutArray{{{5, 7, 7, 2, 32}, + {3136, 448, 64, 32, 1}, + dtype::QuantizedS4{1.f}}, + {{5, 7, 7, 2, 32}, + {3136, 224, 32, 1568, 1}, + dtype::QuantizedS4{1.f}}}); +} // vim: syntax=cpp.doxygen diff --git a/dnn/test/cuda/type_cvt.cpp b/dnn/test/cuda/type_cvt.cpp index 87dc654bf9dbabde0b123f57ff61dabe6511a554..8567c9f75bc05f0940c00ad1f0d71cfc6802506b 100644 --- a/dnn/test/cuda/type_cvt.cpp +++ b/dnn/test/cuda/type_cvt.cpp @@ -106,13 +106,43 @@ TEST_F(CUDA, QUANTIZED_TYPECVT) { run(dtype::Quantized8Asymm(1e-3f, (uint8_t)18), dtype::QuantizedS32(7e-4f)); } +TEST_F(CUDA, QUANTIZED_TYPECVT_4BIT) { + UniformIntRNG int_rng{0, 8}; + Checker checker(handle_cuda()); + checker.set_rng(0, &int_rng).set_rng(1, &int_rng); + + auto set_err = [&](const DType& dst_dtype) { + if (dst_dtype.category() == DTypeCategory::FLOAT) + checker.set_epsilon(1e-6); + else { + checker.set_epsilon(1e-3); + } + }; + + auto run = [&](const DType& src_dtype, const DType& dst_dtype) { + set_err(dst_dtype); + checker.set_dtype(0, src_dtype) + .set_dtype(1, dst_dtype) + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + set_err(src_dtype); + checker.set_dtype(0, 
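// ---- editor's note, not part of the patch ------------------------------------
// Bandwidth figure printed by the Q4 elemwise benchmark above: two inputs and
// one output are touched, each holding N*C*H*W packed 4-bit elements
// (N*C*H*W/2 bytes), and the measured time is assumed to be in milliseconds,
// giving GB/s = 3 * N * C * H * W / 2 / (time_ms * 1e6). For
// run_bench(256, 256, 56, 56) that is roughly 0.31 GB of traffic per run.
inline double q4_elemwise_bandwidth_gbps(double n, double c, double h, double w,
                                         double time_ms) {
    double bytes = 3.0 * n * c * h * w / 2.0;   // two inputs + one output, 4 bits each
    return bytes / (time_ms * 1e6);
}
// ------------------------------------------------------------------------------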
dst_dtype) + .set_dtype(1, src_dtype) + .execs({{16, 3, 224, 223}, {16, 3, 224, 223}}); + }; + + run(dtype::Quantized4Asymm{1.19990518f, 8}, + dtype::Quantized8Asymm{1.f, 128}); + run(dtype::QuantizedS4{1.19990518f}, dtype::QuantizedS8{1.19990518f}); + run(dtype::QuantizedS4{1.19990518f}, + dtype::Quantized4Asymm{1.19990518f, 8}); +} + TEST_F(CUDA, TYPE_CVT_BFLOAT16) { Checker checker(handle_cuda()); UniformFloatRNG rng(-20, 20); checker.set_rng(0, &rng); std::vector dtypes = {dtype::Float32(), dtype::Float16(), - dtype::Int32(), dtype::Int16(), - dtype::Int8()}; + dtype::Int32(), dtype::Int16(), dtype::Int8()}; for (auto sdtype : dtypes) { TensorLayout src({10, 10}, sdtype), dst({10, 10}, dtype::BFloat16()); checker.exec(TensorLayoutArray{src, dst}); @@ -180,6 +210,38 @@ TEST_F(CUDA, BENCHMARK_TYPE_CVT) { dtype::Quantized8Asymm(5.f, (uint8_t)(30))}; run(src, dst); } + +TEST_F(CUDA, BENCHMARK_TYPE_CVT_Q4) { + UniformIntRNG rng{-7, 7}; + auto run = [&](TensorLayout src, TensorLayout dst) { + Benchmarker benchmarker(handle_cuda()); + auto&& layout = src; + size_t nr_times = 1000; + benchmarker.set_times(nr_times); + dst.init_contiguous_stride(); + auto used = benchmarker.set_dtype(0, src.dtype) + .set_dtype(1, dst.dtype) + .set_rng(0, &rng) + .execl({src, dst}) / + nr_times; + printf("layout: %s time %.2fms, bandwith: %f GB/s\n", + layout.to_string().c_str(), used, + (1.f * src.access_bytes() + dst.access_bytes()) / (used * 1e6)); + }; + + // NCHW astype(float32) + TensorLayout src = + TensorLayout{{256, 256, 56, 56}, dtype::QuantizedS8(1.f)}; + TensorLayout dst = + TensorLayout{{256, 256, 56, 56}, dtype::QuantizedS4(1.f)}; + + run(src, dst); + + src = TensorLayout{{256, 4, 56, 56, 64}, dtype::QuantizedS4(1.f)}; + dst = TensorLayout{{256, 4, 56, 56, 64}, dtype::QuantizedS8(1.f)}; + run(src, dst); +} + #endif // vim: syntax=cpp.doxygen diff --git a/dnn/test/naive/elemwise_multi_type.cpp b/dnn/test/naive/elemwise_multi_type.cpp index 71681fd4beebe12f53b67db2b7743fec504732a9..1f4c94741b2fa1a1187b1367fff77d6c982309cd 100644 --- a/dnn/test/naive/elemwise_multi_type.cpp +++ b/dnn/test/naive/elemwise_multi_type.cpp @@ -115,8 +115,8 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_UNARY) { auto extra_impl = [&](const TensorNDArray& tensors) { TensorNDArray float_tensors; for (size_t i = 0; i < tensors.size(); ++i) { - auto layout = tensors[i].layout; - layout.dtype = dtype::Float32(); + TensorLayout layout(static_cast(tensors[i].layout), + dtype::Float32()); float_tensors.emplace_back(malloc(layout.span().dist_byte()), std::move(layout)); } @@ -206,8 +206,8 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_BINARY) { auto extra_impl = [&](const TensorNDArray& tensors) { TensorNDArray float_tensors; for (size_t i = 0; i < tensors.size(); ++i) { - auto layout = tensors[i].layout; - layout.dtype = dtype::Float32(); + TensorLayout layout(static_cast(tensors[i].layout), + dtype::Float32()); float_tensors.emplace_back(malloc(layout.span().dist_byte()), std::move(layout)); } @@ -266,8 +266,8 @@ TEST_F(NAIVE, ELEMWISE_QUANTIZED_MODE_TERNARY) { auto extra_impl = [&](const TensorNDArray& tensors) { TensorNDArray float_tensors; for (size_t i = 0; i < tensors.size(); ++i) { - auto layout = tensors[i].layout; - layout.dtype = dtype::Float32(); + TensorLayout layout(static_cast(tensors[i].layout), + dtype::Float32()); float_tensors.emplace_back(malloc(layout.span().dist_byte()), std::move(layout)); }