diff --git a/dnn/src/cuda/elemwise_helper.cpp b/dnn/src/cuda/elemwise_helper.cpp index e64cc593fcc55a38214738328013a9babb9f6168..a285782f760aa07bc3222cdc8ea2302ab486ee2b 100644 --- a/dnn/src/cuda/elemwise_helper.cpp +++ b/dnn/src/cuda/elemwise_helper.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/elemwise_helper.cuh" @@ -21,7 +22,7 @@ #define _cb_check_ndim(n) megdnn::TensorShape::MAX_NDIM == n || static_assert(MEGDNN_FOREACH_TENSOR_NDIM(_cb_check_ndim) false, - "bad foreach ndim"); + "bad foreach ndim"); #undef _cb_check_ndim namespace megdnn { @@ -32,28 +33,30 @@ namespace elemwise_intl { #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" -template -void ParamElemVisitor::host_init( - const TensorND &rv, int /*grid_size*/, int /*block_size*/) { +template +void ParamElemVisitor::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { megdnn_assert(rv.layout.ndim && rv.layout.ndim <= ndim); m_ptr = rv.ptr(); - for (size_t i = 0; i < rv.layout.ndim; ++ i) { + for (size_t i = 0; i < rv.layout.ndim; ++i) { m_stride[i] = rv.layout.stride[i]; if (i + 1 < rv.layout.ndim) m_shape_highdim[i] = rv.layout.shape[i + 1]; } - for (int i = rv.layout.ndim - 1; i < ndim - 1; ++ i) { + for (int i = rv.layout.ndim - 1; i < ndim - 1; ++i) { m_shape_highdim[i] = 1; } - for (int i = rv.layout.ndim; i < ndim; ++ i) { + for (int i = rv.layout.ndim; i < ndim; ++i) { m_stride[i] = 0; } } #pragma GCC diagnostic pop template -void ParamElemVisitor<3, ctype, BCAST_101>::host_init( - const TensorND& rv, int grid_size, int block_size) { +void ParamElemVisitor<3, ctype, BCAST_101>::host_init(const TensorND& rv, + int grid_size, + int block_size) { uint32_t shape2, shape1; int stride1; if (rv.layout.ndim == 3) { @@ -74,8 +77,8 @@ void ParamElemVisitor<3, ctype, BCAST_101>::host_init( template void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, - int grid_size, - int block_size) { + int grid_size, + int block_size) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); m_ptr = rv.ptr(); m_stride1 = rv.layout.stride[1]; @@ -85,8 +88,8 @@ void ParamElemVisitor<2, ctype, BCAST_10>::host_init(const TensorND& rv, template void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, - int grid_size, - int block_size) { + int grid_size, + int block_size) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[1]); m_ptr = rv.ptr(); m_stride0 = rv.layout.stride[0]; @@ -94,9 +97,10 @@ void ParamElemVisitor<2, ctype, BCAST_01>::host_init(const TensorND& rv, rv.layout.shape[1]); } -template -void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init( - const TensorND &rv, int /*grid_size*/, int /*block_size*/) { +template +void ParamElemVisitor<1, ctype, BCAST_FULL>::host_init(const TensorND& rv, + int /*grid_size*/, + int /*block_size*/) { megdnn_assert(rv.layout.ndim == NDIM && !rv.layout.stride[0]); m_ptr = rv.ptr(); } @@ -119,14 +123,13 @@ void ParamVectVisitor<4, ctype, BCAST_1010>::host_init(const TensorND& rv, } #define INST(ndim, ctype, brd) template class ParamElemVisitor -#define INST_FOR_CTYPE \ +#define INST_FOR_CTYPE \ MEGDNN_FOREACH_TENSOR_NDIM(ndim_cb) \ - INST(3, ct, BCAST_101); \ - INST(2, ct, BCAST_10); \ - INST(2, ct, BCAST_01); 
\ + INST(3, ct, BCAST_101); \ + INST(2, ct, BCAST_10); \ + INST(2, ct, BCAST_01); \ INST(1, ct, BCAST_FULL); - #define ndim_cb(_ndim) INST(_ndim, ct, BCAST_OTHER); #define ct dt_byte @@ -175,11 +178,10 @@ INST(dt_qint8); INST(dt_quint8); #undef dt_ibyte -} // namespace elemwise_intl - +} // namespace elemwise_intl -void elemwise_intl::get_launch_spec( - const void *kern, size_t size, int *grid_size, int *block_size) { +void elemwise_intl::get_launch_spec(const void* kern, size_t size, + int* grid_size, int* block_size) { safe_size_in_kern(size); auto config = query_launch_config_for_kernel(kern); *block_size = config.block_size; @@ -202,11 +204,8 @@ void elemwise_intl::get_launch_spec( void elemwise_intl::on_bad_ndim(int ndim) { megdnn_throw(ssprintf("invalid ndim: %d", ndim)); MEGDNN_MARK_USED_VAR(ndim); - } -} // namespace cuda -} // namespace megdnn - +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - diff --git a/dnn/src/cuda/elemwise_helper.cuh b/dnn/src/cuda/elemwise_helper.cuh index d9c5e5d169ba90542800ed52c6ec346adba360b2..47ed82d01111fcdcc0f6b60c69c5a7378eca9041 100644 --- a/dnn/src/cuda/elemwise_helper.cuh +++ b/dnn/src/cuda/elemwise_helper.cuh @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once @@ -28,56 +29,56 @@ namespace cuda { namespace elemwise_intl { #define devfunc __device__ __forceinline__ - /*! - * \brief get cuda launch specs for element-wise kernel - * \param kern kernel function address - * \param size total size of elements - */ - void get_launch_spec( - const void *kern, size_t size, int *grid_size, int *block_size); - - MEGDNN_NORETURN void on_bad_ndim(int ndim); - - /*! - * \brief broadcast type - * BCAST_x[0]x[1]...: x[i] == !stride[i] - */ - enum BcastType { - BCAST_OTHER, - BCAST_1010, - BCAST_101, - BCAST_10, - BCAST_01, - BCAST_FULL - }; +/*! + * \brief get cuda launch specs for element-wise kernel + * \param kern kernel function address + * \param size total size of elements + */ +void get_launch_spec(const void* kern, size_t size, int* grid_size, + int* block_size); - /*! - * \brief read and write type trait for byte width integer type - */ - template - class VectTypeTrait; +MEGDNN_NORETURN void on_bad_ndim(int ndim); - struct __attribute__((aligned(8))) half4 { - dt_float16 x, y, z, w; - }; +/*! + * \brief broadcast type + * BCAST_x[0]x[1]...: x[i] == !stride[i] + */ +enum BcastType { + BCAST_OTHER, + BCAST_1010, + BCAST_101, + BCAST_10, + BCAST_01, + BCAST_FULL +}; - __device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, - dt_float16 z, dt_float16 w) { - half4 t; - t.x = x, t.y = y, t.z = z, t.w = w; - return t; - } +/*! 
+ * \brief read and write type trait for byte width integer type + */ +template +class VectTypeTrait; + +struct __attribute__((aligned(8))) half4 { + dt_float16 x, y, z, w; +}; + +__device__ __forceinline__ half4 make_half4(dt_float16 x, dt_float16 y, + dt_float16 z, dt_float16 w) { + half4 t; + t.x = x, t.y = y, t.z = z, t.w = w; + return t; +} - struct __attribute__((aligned(8))) bhalf4 { - dt_bfloat16 x, y, z, w; - }; +struct __attribute__((aligned(8))) bhalf4 { + dt_bfloat16 x, y, z, w; +}; - __device__ __forceinline__ bhalf4 make_bhalf4(dt_bfloat16 x, dt_bfloat16 y, - dt_bfloat16 z, dt_bfloat16 w) { - bhalf4 t; - t.x = x, t.y = y, t.z = z, t.w = w; - return t; - } +__device__ __forceinline__ bhalf4 make_bhalf4(dt_bfloat16 x, dt_bfloat16 y, + dt_bfloat16 z, dt_bfloat16 w) { + bhalf4 t; + t.x = x, t.y = y, t.z = z, t.w = w; + return t; +} #define INST(_ctype, _vect_type) \ template <> \ @@ -94,245 +95,245 @@ namespace elemwise_intl { } \ } #define as_raw(x) x - INST(dt_int8, char4); - INST(dt_uint8, uchar4); - INST(dt_float32, float4); - INST(dt_float16, half4); - INST(dt_bfloat16, bhalf4); - INST(dt_int32, int4); - INST(dt_int16, short4); +INST(dt_int8, char4); +INST(dt_uint8, uchar4); +INST(dt_float32, float4); +INST(dt_float16, half4); +INST(dt_bfloat16, bhalf4); +INST(dt_int32, int4); +INST(dt_int16, short4); #undef as_raw #define as_raw(x) x.as_int8() - INST(dt_qint8, char4); +INST(dt_qint8, char4); #undef as_raw #define as_raw(x) x.as_uint8() - INST(dt_quint8, uchar4); +INST(dt_quint8, uchar4); #undef as_raw #define as_raw(x) x.as_int32() - INST(dt_qint32, int4); +INST(dt_qint32, int4); #undef as_raw #undef INST - /*! - * \brief visitor to access an elemeent in a tensor at given logic index - * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) - * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is - * 0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1. - * - * host interface: - * void host_init( - * const TensorND &tensor, int grid_size, int block_size) - * - * device interface: - * void thread_init(uint32_t idx) - * called on thread entrance, with logical indexing; the index may - * go beyond buffer range - * - * ctype* ptr() - * return buffer pointer; can be used by specialized OpCaller - * - * void next() - * called before moving to next chunk on each thread - * - * int offset(uint32_t idx) - * get physical offset from logical index - * - * ctype& at(uint32_t idx) - * ptr()[offset(idx)] - * - */ - template - class ParamElemVisitor; - - /*! - * \brief visitor to access vector element in a tensor at given logic index - * \tparam ctype same as ParamElemVisitor, vect_type packed vector type of - * element ctype (i.e. vect_type in VectTypeTrait) \tparam brdcast_mask same - * as ParamElemVisitor - * - * - * device interface: - * vect_type& at(uint32_t idx) - * ptr()[offset(idx)] - * - */ - template - class ParamVectVisitor; - - /* f{{{ ParamElemVisitor specializations */ +/*! + * \brief visitor to access an elemeent in a tensor at given logic index + * \tparam ctype plain element ctype (i.e. ctype in DTypeTrait) + * \tparam brdcast_mask bit mask for broadcast of params; (i.e. stride[i] is + * 0 iff (brdcast_mask & (1<<(ndim-1-i))) is 1. 
+ * + * host interface: + * void host_init( + * const TensorND &tensor, int grid_size, int block_size) + * + * device interface: + * void thread_init(uint32_t idx) + * called on thread entrance, with logical indexing; the index may + * go beyond buffer range + * + * ctype* ptr() + * return buffer pointer; can be used by specialized OpCaller + * + * void next() + * called before moving to next chunk on each thread + * + * int offset(uint32_t idx) + * get physical offset from logical index + * + * ctype& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ +template +class ParamElemVisitor; + +/*! + * \brief visitor to access vector element in a tensor at given logic index + * \tparam ctype same as ParamElemVisitor, vect_type packed vector type of + * element ctype (i.e. vect_type in VectTypeTrait) \tparam brdcast_mask same + * as ParamElemVisitor + * + * + * device interface: + * vect_type& at(uint32_t idx) + * ptr()[offset(idx)] + * + */ +template +class ParamVectVisitor; + +/* f{{{ ParamElemVisitor specializations */ #define PARAM_ELEM_VISITOR_COMMON_DEV \ devfunc ctype* ptr() { return m_ptr; } \ devfunc ctype& at(uint32_t idx) { return m_ptr[offset(idx)]; } #define PARAM_ELEM_VISITOR_COMMON_HOST static const int packed_size = 1; - //! specialization for BCAST_OTHER - template - class ParamElemVisitor { - protected: - ctype* __restrict m_ptr; +//! specialization for BCAST_OTHER +template +class ParamElemVisitor { +protected: + ctype* __restrict m_ptr; - private: - int m_stride[ndim]; +private: + int m_stride[ndim]; - //! m_shape_highdim[i] = original_shape[i + 1] + //! m_shape_highdim[i] = original_shape[i + 1] #ifdef _MSC_VER - Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1]; + Uint32Fastdiv m_shape_highdim[ndim > 1 ? ndim - 1 : 1]; #else - Uint32Fastdiv m_shape_highdim[ndim]; + Uint32Fastdiv m_shape_highdim[ndim]; #endif - public: - static const int NDIM = ndim; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = ndim; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t) {} + devfunc void thread_init(uint32_t) {} - devfunc void next() {} + devfunc void next() {} - devfunc int offset(uint32_t idx) { - int offset = 0; + devfunc int offset(uint32_t idx) { + int offset = 0; #pragma unroll - for (int i = ndim - 1; i >= 1; --i) { - Uint32Fastdiv& shp = m_shape_highdim[i - 1]; - uint32_t idx_div = idx / shp; - offset += (idx - idx_div * shp.divisor()) * m_stride[i]; - idx = idx_div; - } - offset += idx * m_stride[0]; - return offset; + for (int i = ndim - 1; i >= 1; --i) { + Uint32Fastdiv& shp = m_shape_highdim[i - 1]; + uint32_t idx_div = idx / shp; + offset += (idx - idx_div * shp.divisor()) * m_stride[i]; + idx = idx_div; } + offset += idx * m_stride[0]; + return offset; + } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; - - /*! - * \brief specialization for ndim == 3 and BCAST_101 - * (for dimshuffle 'x', 0, 'x') - * - * visit: idx / m_shape2 % m_shape1 - */ - template - class ParamElemVisitor<3, ctype, BCAST_101> { - StridedDivSeq2 m_shape12; - int m_stride1; - - protected: - ctype* __restrict m_ptr; - - public: - static const int NDIM = 3; - PARAM_ELEM_VISITOR_COMMON_HOST - - void host_init(const TensorND& rv, int grid_size, int block_size); +}; + +/*! 
+ * \brief specialization for ndim == 3 and BCAST_101 + * (for dimshuffle 'x', 0, 'x') + * + * visit: idx / m_shape2 % m_shape1 + */ +template +class ParamElemVisitor<3, ctype, BCAST_101> { + StridedDivSeq2 m_shape12; + int m_stride1; + +protected: + ctype* __restrict m_ptr; + +public: + static const int NDIM = 3; + PARAM_ELEM_VISITOR_COMMON_HOST + + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape12.device_init(idx); } - devfunc void next() { m_shape12.next(); } + devfunc void next() { m_shape12.next(); } - devfunc int offset(uint32_t idx) { return m_shape12.get() * m_stride1; } + devfunc int offset(uint32_t idx) { return m_shape12.get() * m_stride1; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - /*! - * \brief specialization for ndim == 2 and BCAST_10 - * - * visit: idx % m_shape1 - */ - template - class ParamElemVisitor<2, ctype, BCAST_10> { - StridedDivSeq m_shape1; - int m_stride1; +/*! + * \brief specialization for ndim == 2 and BCAST_10 + * + * visit: idx % m_shape1 + */ +template +class ParamElemVisitor<2, ctype, BCAST_10> { + StridedDivSeq m_shape1; + int m_stride1; - protected: - ctype* __restrict m_ptr; +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 2; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } - devfunc void next() { m_shape1.next(); } + devfunc void next() { m_shape1.next(); } - devfunc int offset(uint32_t idx) { return m_shape1.r() * m_stride1; } + devfunc int offset(uint32_t idx) { return m_shape1.r() * m_stride1; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - /*! - * \brief specialization for ndim == 2 and BCAST_01 - * - * visit: idx / shape1 - */ - template - class ParamElemVisitor<2, ctype, BCAST_01> { - StridedDivSeq m_shape1; - int m_stride0; +/*! + * \brief specialization for ndim == 2 and BCAST_01 + * + * visit: idx / shape1 + */ +template +class ParamElemVisitor<2, ctype, BCAST_01> { + StridedDivSeq m_shape1; + int m_stride0; - protected: - ctype* __restrict m_ptr; +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 2; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 2; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } + devfunc void thread_init(uint32_t idx) { m_shape1.device_init(idx); } - devfunc void next() { m_shape1.next(); } + devfunc void next() { m_shape1.next(); } - devfunc int offset(uint32_t idx) { return m_shape1.q() * m_stride0; } + devfunc int offset(uint32_t idx) { return m_shape1.q() * m_stride0; } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; - //! specialization for ndim == 1 and BCAST_FULL - template - class ParamElemVisitor<1, ctype, BCAST_FULL> { - protected: - ctype* __restrict m_ptr; +//! 
specialization for ndim == 1 and BCAST_FULL +template +class ParamElemVisitor<1, ctype, BCAST_FULL> { +protected: + ctype* __restrict m_ptr; - public: - static const int NDIM = 1; - PARAM_ELEM_VISITOR_COMMON_HOST +public: + static const int NDIM = 1; + PARAM_ELEM_VISITOR_COMMON_HOST - void host_init(const TensorND& rv, int grid_size, int block_size); + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t) {} + devfunc void thread_init(uint32_t) {} - devfunc void next() {} + devfunc void next() {} - devfunc int offset(uint32_t idx) { - MEGDNN_MARK_USED_VAR(idx); - return 0; - } + devfunc int offset(uint32_t idx) { + MEGDNN_MARK_USED_VAR(idx); + return 0; + } - PARAM_ELEM_VISITOR_COMMON_DEV + PARAM_ELEM_VISITOR_COMMON_DEV #endif - }; +}; #undef PARAM_ELEM_VISITOR_COMMON_DEV #undef PARAM_ELEM_VISITOR_COMMON_HOST - /* f}}} */ +/* f}}} */ - /* f{{{ ParamVectVisitor specializations */ +/* f{{{ ParamVectVisitor specializations */ #if MEGDNN_CC_CUDA #define DEVICE_WRAPPER(x) x @@ -352,16 +353,16 @@ namespace elemwise_intl { }) \ }; #define _brdcast_mask BCAST_OTHER - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_01 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_10 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define _brdcast_mask BCAST_101 - INST_PARAM_VECT_VISITOR; +INST_PARAM_VECT_VISITOR; #undef _brdcast_mask #define INST_DT_IBYTE(ctype) \ template \ @@ -379,463 +380,453 @@ namespace elemwise_intl { return vect_scalar; \ }) \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE #undef DEVICE_WRAPPER #undef INST_PARAM_VECT_VISITOR - /*! - * \brief specialization for ndim == 4 and BCAST_1010 - * - * visit: (idx % m_shape3) * m_stride3 + (idx / m_shape23 % m_shape1) * - * m_stride1 - */ - template - class ParamVectVisitor<4, ctype, BCAST_1010> { - StridedDivSeq2 m_shape123; - StridedDivSeq m_shape3; - int m_stride3, m_stride1; - ctype* __restrict m_ptr; - - public: - static const int NDIM = 4; - using rwtype = typename VectTypeTrait::vect_type; - static const int packed_size = sizeof(rwtype) / sizeof(ctype); - - void host_init(const TensorND& rv, int grid_size, int block_size); +/*! 
+ * \brief specialization for ndim == 4 and BCAST_1010 + * + * visit: (idx % m_shape3) * m_stride3 + (idx / m_shape23 % m_shape1) * + * m_stride1 + */ +template +class ParamVectVisitor<4, ctype, BCAST_1010> { + StridedDivSeq2 m_shape123; + StridedDivSeq m_shape3; + int m_stride3, m_stride1; + ctype* __restrict m_ptr; + +public: + static const int NDIM = 4; + using rwtype = typename VectTypeTrait::vect_type; + static const int packed_size = sizeof(rwtype) / sizeof(ctype); + + void host_init(const TensorND& rv, int grid_size, int block_size); #if MEGDNN_CC_CUDA - devfunc void thread_init(uint32_t idx) { - m_shape123.device_init(idx); - m_shape3.device_init(idx); - } + devfunc void thread_init(uint32_t idx) { + m_shape123.device_init(idx); + m_shape3.device_init(idx); + } - devfunc void next() { - m_shape123.next(); - m_shape3.next(); - } + devfunc void next() { + m_shape123.next(); + m_shape3.next(); + } - devfunc int offset(uint32_t idx) { - return m_shape3.r() * m_stride3 + m_shape123.get() * m_stride1; - } + devfunc int offset(uint32_t idx) { + return m_shape3.r() * m_stride3 + m_shape123.get() * m_stride1; + } - devfunc ctype* ptr() { return m_ptr; } - devfunc rwtype& at(uint32_t idx) { - return *(rwtype*)(&m_ptr[offset(idx)]); - } + devfunc ctype* ptr() { return m_ptr; } + devfunc rwtype& at(uint32_t idx) { return *(rwtype*)(&m_ptr[offset(idx)]); } #endif - }; - - /* f}}} */ +}; +/* f}}} */ #if MEGDNN_CC_CUDA - /* f{{{ user operator callers */ - - /* - * OpCaller is used to invoke user operator with loaded element arguments. - * - * device interface: - * void thread_init(uint32_t idx); - * - * void on(uint32_t idx); - * - * void next(); - */ - - /*! - * \brief call user op directly without visiting any params (i.e. arity == - * 0) - */ - template - struct OpCallerNull { - Op op; - - devfunc void thread_init(uint32_t) { - } +/* f{{{ user operator callers */ - devfunc void on(uint32_t idx) { - op(idx); - } +/* + * OpCaller is used to invoke user operator with loaded element arguments. + * + * device interface: + * void thread_init(uint32_t idx); + * + * void on(uint32_t idx); + * + * void next(); + */ - devfunc void next() { - } - }; +/*! + * \brief call user op directly without visiting any params (i.e. arity == + * 0) + */ +template +struct OpCallerNull { + Op op; - /*! - * \brief call an operator whose each param are promted to the same ndim and - * brdcast_mask - * \tparam PVis ParamElemVisitor class - */ - template - struct OpCallerUniform; - - //! specialization for arity == 1 - template - struct OpCallerUniform { - Op op; - PVis par[1]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - } + devfunc void thread_init(uint32_t) {} - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx)); - } + devfunc void on(uint32_t idx) { op(idx); } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)]); - } - } - } + devfunc void next() {} +}; - devfunc void next() { - par[0].next(); - } - }; - //! 
specialization for arity == 2 - template - struct OpCallerUniform { - Op op; - PVis par[2]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - par[1].thread_init(idx); - } +/*! + * \brief call an operator whose each param are promted to the same ndim and + * brdcast_mask + * \tparam PVis ParamElemVisitor class + */ +template +struct OpCallerUniform; + +//! specialization for arity == 1 +template +struct OpCallerUniform { + Op op; + PVis par[1]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx), par[1].at(idx)); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx)); + } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx), par[1].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - auto ptr1 = par[1].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)], - ptr1[par[1].offset(idx + i)]); - } + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)]); } } + } - devfunc void next() { - par[0].next(); - par[1].next(); - } - }; - //! specialization for arity == 3 - template - struct OpCallerUniform { - Op op; - PVis par[3]; - static const uint32_t packed_size = PVis::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par[0].thread_init(idx); - par[1].thread_init(idx); - par[2].thread_init(idx); - } + devfunc void next() { par[0].next(); } +}; +//! specialization for arity == 2 +template +struct OpCallerUniform { + Op op; + PVis par[2]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx)); + } - devfunc void on(uint32_t idx, uint32_t remain) { - idx = idx * packed_size; - if (remain >= packed_size) { - op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); - } else { - auto ptr0 = par[0].ptr(); - auto ptr1 = par[1].ptr(); - auto ptr2 = par[2].ptr(); - for (int i = 0; i < remain; i++) { - op(idx + i, ptr0[par[0].offset(idx + i)], - ptr1[par[1].offset(idx + i)], - ptr2[par[2].offset(idx + i)]); - } + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)]); } } + } - devfunc void next() { - par[0].next(); - par[1].next(); - par[2].next(); - } - }; + devfunc void next() { + par[0].next(); + par[1].next(); + } +}; +//! 
specialization for arity == 3 +template +struct OpCallerUniform { + Op op; + PVis par[3]; + static const uint32_t packed_size = PVis::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par[0].thread_init(idx); + par[1].thread_init(idx); + par[2].thread_init(idx); + } - /*! - * \brief call binary (i.e. arity == 2) operator with different param - * visitors - */ - template - struct OpCallerBinary { - Op op; - PVis0 par0; - PVis1 par1; - MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, - "vector size mismatch") - - static const uint32_t packed_size = PVis0::packed_size; - - devfunc void thread_init(uint32_t idx) { - idx = idx * packed_size; - par0.thread_init(idx); - par1.thread_init(idx); - } + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } - devfunc void on(uint32_t idx) { - idx = idx * packed_size; - op(idx, par0.at(idx), par1.at(idx)); + devfunc void on(uint32_t idx, uint32_t remain) { + idx = idx * packed_size; + if (remain >= packed_size) { + op(idx, par[0].at(idx), par[1].at(idx), par[2].at(idx)); + } else { + auto ptr0 = par[0].ptr(); + auto ptr1 = par[1].ptr(); + auto ptr2 = par[2].ptr(); + for (int i = 0; i < remain; i++) { + op(idx + i, ptr0[par[0].offset(idx + i)], + ptr1[par[1].offset(idx + i)], ptr2[par[2].offset(idx + i)]); + } } + } - devfunc void next() { - par0.next(); - par1.next(); - } - }; + devfunc void next() { + par[0].next(); + par[1].next(); + par[2].next(); + } +}; + +/*! + * \brief call binary (i.e. arity == 2) operator with different param + * visitors + */ +template +struct OpCallerBinary { + Op op; + PVis0 par0; + PVis1 par1; + MEGDNN_STATIC_ASSERT(PVis0::packed_size == PVis1::packed_size, + "vector size mismatch") + + static const uint32_t packed_size = PVis0::packed_size; + + devfunc void thread_init(uint32_t idx) { + idx = idx * packed_size; + par0.thread_init(idx); + par1.thread_init(idx); + } - /* f}}} */ + devfunc void on(uint32_t idx) { + idx = idx * packed_size; + op(idx, par0.at(idx), par1.at(idx)); + } - template - __global__ void cuda_kern(OpCaller op_caller, uint32_t size) { - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, - delta = blockDim.x * gridDim.x; - // each thread works on at most 3 elements; see get_launch_spec - op_caller.thread_init(idx); + devfunc void next() { + par0.next(); + par1.next(); + } +}; + +/* f}}} */ + +template +__global__ void cuda_kern(OpCaller op_caller, uint32_t size) { + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + // each thread works on at most 3 elements; see get_launch_spec + op_caller.thread_init(idx); + if (idx < size) { + op_caller.on(idx); + idx += delta; if (idx < size) { + op_caller.next(); op_caller.on(idx); idx += delta; if (idx < size) { op_caller.next(); op_caller.on(idx); - idx += delta; - if (idx < size) { - op_caller.next(); - op_caller.on(idx); - } } } } +} - template - __global__ void cuda_kern(OpCallerUniform op_caller, - uint32_t size) { - constexpr uint32_t packed_size = PVis::packed_size; - const uint32_t size_packed = DIVUP(size, packed_size); - uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, - delta = blockDim.x * gridDim.x; +template +__global__ void cuda_kern(OpCallerUniform op_caller, + uint32_t size) { + constexpr uint32_t packed_size = PVis::packed_size; + const uint32_t size_packed = DIVUP(size, packed_size); + uint32_t idx = blockIdx.x * blockDim.x + threadIdx.x, + delta = blockDim.x * gridDim.x; + if (idx < 
size_packed) { + op_caller.on(idx, size - packed_size * idx); + idx += delta; if (idx < size_packed) { op_caller.on(idx, size - packed_size * idx); idx += delta; if (idx < size_packed) { op_caller.on(idx, size - packed_size * idx); - idx += delta; - if (idx < size_packed) { - op_caller.on(idx, size - packed_size * idx); - } } } } +} - //! invoke a user Op passed to run_elemwise - template - class UserOpInvoker; +//! invoke a user Op passed to run_elemwise +template +class UserOpInvoker; - /* f{{{ UserOpInvoker specializations */ +/* f{{{ UserOpInvoker specializations */ - //! run op by promoting all params to same ndim - template - class UserOpInvokerToSameNdim { - const ElemwiseOpParamN &m_param; - cudaStream_t m_stream; - const Op &m_op; +//! run op by promoting all params to same ndim +template +class UserOpInvokerToSameNdim { + const ElemwiseOpParamN& m_param; + cudaStream_t m_stream; + const Op& m_op; - void dispatch0() { - switch(m_param.max_ndim) { + void dispatch0() { + switch (m_param.max_ndim) { #define cb(ndim) \ - case ndim: return dispatch1(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + case ndim: \ + return dispatch1(); + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - template - void dispatch1() { - typedef OpCallerUniform< - Op, arity, - ParamElemVisitor> - Caller; - size_t size = m_param.size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1() { + typedef OpCallerUniform> + Caller; + size_t size = m_param.size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, - cudaStream_t stream, const Op& op) - : m_param(param), m_stream(stream), m_op(op) { - dispatch0(); - } - }; +public: + UserOpInvokerToSameNdim(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_param(param), m_stream(stream), m_op(op) { + dispatch0(); + } +}; - template - class UserOpInvokerToSameNdimIByteHelper { - public: - UserOpInvokerToSameNdimIByteHelper(const ElemwiseOpParamN& param, - cudaStream_t stream, const Op& op) - : m_rw_size(param.size), - m_param(param), - m_stream(stream), - m_op(op) { - if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { - dispatch0(); - } +template +class UserOpInvokerToSameNdimIByteHelper { +public: + UserOpInvokerToSameNdimIByteHelper(const ElemwiseOpParamN& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); } + } - private: - const ElemwiseOpParamN& m_param; - size_t m_rw_size; - cudaStream_t m_stream; - const Op& m_op; - using vect_type = typename VectTypeTrait::vect_type; - static const size_t packed_size = VectTypeTrait::packed_size; +private: + const ElemwiseOpParamN& m_param; + size_t m_rw_size; + cudaStream_t m_stream; + const Op& m_op; + using 
vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; - void dispatch0() { - switch (m_param.max_ndim) { + void dispatch0() { + switch (m_param.max_ndim) { #define cb(ndim) \ case ndim: \ return dispatch1(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - void dispatch0_vect() { - switch (m_param.max_ndim) { + void dispatch0_vect() { + switch (m_param.max_ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_vect(); - MEGDNN_FOREACH_TENSOR_NDIM(cb) + MEGDNN_FOREACH_TENSOR_NDIM(cb) #undef cb - } - on_bad_ndim(m_param.max_ndim); } + on_bad_ndim(m_param.max_ndim); + } - void dispatch_contiguous() { - typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, - m_param.size); - after_kernel_launch(); - } + void dispatch_contiguous() { + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, m_param.size); + after_kernel_launch(); + } - template - void dispatch1() { - typedef ParamElemVisitor PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1() { + typedef ParamElemVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - template - void dispatch1_vect() { - typedef ParamVectVisitor PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - for (int i = 0; i < arity; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void dispatch1_vect() { + typedef ParamVectVisitor PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + for (int i = 0; i < arity; ++i) + caller.par[i].host_init(m_param[i], grid_size, 
block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - bool try_vect_load_store() { - auto try_last_contig = [](const TensorLayout& layout) { - return layout.stride[layout.ndim - 1] == 1 && - layout[layout.ndim - 1] % packed_size == 0; - }; - /* - * \NOTE: remove try_scalar() to adapt multi-type tenary op - */ - for (int i = 0; i < arity; ++i) { - if (!try_last_contig(m_param[i].layout)) return false; - } - m_rw_size /= packed_size; - dispatch0_vect(); - return true; + bool try_vect_load_store() { + auto try_last_contig = [](const TensorLayout& layout) { + return layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0; + }; + /* + * \NOTE: remove try_scalar() to adapt multi-type tenary op + */ + for (int i = 0; i < arity; ++i) { + if (!try_last_contig(m_param[i].layout)) + return false; } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } - bool try_vect_load_store_contiguous() { - auto try_contig = [](const TensorLayout& layout) { - return (layout.is_contiguous()); - }; - for (int i = 0; i < arity; ++i) { - if (!try_contig(m_param[i].layout)) - return false; - } - m_rw_size = DIVUP(m_rw_size, packed_size); - dispatch_contiguous(); - return true; + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < arity; ++i) { + if (!try_contig(m_param[i].layout)) + return false; } - }; + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } +}; #define INST_DT_IBYTE(ctype) \ template \ @@ -848,154 +839,148 @@ namespace elemwise_intl { cudaStream_t stream, const Op& op) \ : Super{param, stream, op} {} \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE - //! implement general case by UserOpInvokerToSameNdim - template - class UserOpInvoker: public UserOpInvokerToSameNdim { - public: - UserOpInvoker( - const ElemwiseOpParamN ¶m, - cudaStream_t stream, - const Op &op): - UserOpInvokerToSameNdim(param, stream, op) - { - } - }; - - //! specialization for arity == 0 - template - class UserOpInvoker { - public: - UserOpInvoker( - const ElemwiseOpParamN<0> ¶m, - cudaStream_t stream, - const Op &op) { - size_t size = param.size; - typedef OpCallerNull Caller; - Caller caller; - caller.op = op; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - (*fptr) <<< grid_size, block_size, 0, stream >>> (caller, size); - after_kernel_launch(); - } - }; +//! implement general case by UserOpInvokerToSameNdim +template +class UserOpInvoker : public UserOpInvokerToSameNdim { +public: + UserOpInvoker(const ElemwiseOpParamN& param, cudaStream_t stream, + const Op& op) + : UserOpInvokerToSameNdim(param, stream, op) {} +}; + +//! 
specialization for arity == 0 +template +class UserOpInvoker { +public: + UserOpInvoker(const ElemwiseOpParamN<0>& param, cudaStream_t stream, + const Op& op) { + size_t size = param.size; + typedef OpCallerNull Caller; + Caller caller; + caller.op = op; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } +}; #define DEFINE_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, _stride) \ - _cb_header(1) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0]) { \ - return _cb_dispatch(1, BCAST_FULL); \ - } \ - _cb_dispatch(1, BCAST_OTHER); \ - } \ - _cb_header(2) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0] && stride[1]) { \ - return _cb_dispatch(2, BCAST_10); \ - } \ - if (stride[0] && !stride[1]) { \ - return _cb_dispatch(2, BCAST_01); \ - } \ - _cb_dispatch(2, BCAST_OTHER); \ - } \ - _cb_header(3) { \ - const ptrdiff_t *stride = _stride; \ - if (!stride[0] && stride[1] && !stride[2]) { \ - return _cb_dispatch(3, BCAST_101); \ - } \ - _cb_dispatch(3, BCAST_OTHER); \ + _cb_header(1) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0]) { \ + return _cb_dispatch(1, BCAST_FULL); \ + } \ + _cb_dispatch(1, BCAST_OTHER); \ + } \ + _cb_header(2) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0] && stride[1]) { \ + return _cb_dispatch(2, BCAST_10); \ + } \ + if (stride[0] && !stride[1]) { \ + return _cb_dispatch(2, BCAST_01); \ + } \ + _cb_dispatch(2, BCAST_OTHER); \ + } \ + _cb_header(3) { \ + const ptrdiff_t* stride = _stride; \ + if (!stride[0] && stride[1] && !stride[2]) { \ + return _cb_dispatch(3, BCAST_101); \ + } \ + _cb_dispatch(3, BCAST_OTHER); \ } - //! specialization for binary opr - template - class UserOpInvoker { - bool m_invoked; - const ElemwiseOpParamN<2> &m_param; - cudaStream_t m_stream; - const Op &m_op; - - void fallback() { - megdnn_assert(!m_invoked); - UserOpInvokerToSameNdim(m_param, m_stream, m_op); - m_invoked = true; - } +//! 
specialization for binary opr +template +class UserOpInvoker { + bool m_invoked; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + const Op& m_op; + + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } - void dispatch0() { - switch(m_param[0].layout.ndim) { + void dispatch0() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ - case ndim: return dispatch1_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + case ndim: \ + return dispatch1_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } #define cb_header(ndim) void dispatch1_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ - dispatch2 >() -DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + dispatch2>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch - - template - void dispatch2() { - switch(m_param[1].layout.ndim) { + template + void dispatch2() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ - case ndim: return dispatch3_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + case ndim: \ + return dispatch3_##ndim(); + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } -#define cb_header(ndim) \ - template \ +#define cb_header(ndim) \ + template \ void dispatch3_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ - do_run >() -DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + do_run>() + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch - template - void do_run() { - megdnn_assert(!m_invoked); - m_invoked = true; - typedef OpCallerBinary Caller; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - size_t size = m_param.size; - get_launch_spec(reinterpret_cast(fptr), - size, &grid_size, &block_size); - Caller caller; - caller.op = m_op; - caller.par0.host_init(m_param[0], grid_size, block_size); - caller.par1.host_init(m_param[1], grid_size, block_size); - (*fptr) <<< grid_size, block_size, 0, m_stream >>> (caller, size); - after_kernel_launch(); - } + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_param.size; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvoker(const ElemwiseOpParamN<2> ¶m, cudaStream_t stream, - const Op &op): - m_param(param), m_stream(stream), m_op(op) - { - m_invoked = false; - dispatch0(); - megdnn_assert(m_invoked); - } - }; +public: + UserOpInvoker(const ElemwiseOpParamN<2>& param, cudaStream_t stream, + const Op& op) + : m_param(param), m_stream(stream), m_op(op) { + m_invoked = false; + dispatch0(); + megdnn_assert(m_invoked); + } +}; #define DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(_cb_header, _cb_dispatch, \ _stride) \ @@ -1008,142 +993,141 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, _cb_dispatch(4, BCAST_OTHER); \ } - template - class UserOpInvokerBinaryIByteHelper { - private: - bool m_invoked; - size_t m_rw_size; - const ElemwiseOpParamN<2>& m_param; - cudaStream_t 
m_stream; - const Op& m_op; - using vect_type = typename VectTypeTrait::vect_type; - static const size_t packed_size = VectTypeTrait::packed_size; - bool try_vect_load_store() { - auto try_last_contig_or_scalar = [](const TensorLayout& layout) { - return (layout.stride[layout.ndim - 1] == 1 && - layout[layout.ndim - 1] % packed_size == 0) || - (layout.ndim == 1 && layout.stride[0] == 0); - }; - for (int i = 0; i < 2; ++i) { - if (!try_last_contig_or_scalar(m_param[i].layout)) - return false; - } - m_rw_size /= packed_size; - dispatch0_vect(); - return true; +template +class UserOpInvokerBinaryIByteHelper { +private: + bool m_invoked; + size_t m_rw_size; + const ElemwiseOpParamN<2>& m_param; + cudaStream_t m_stream; + const Op& m_op; + using vect_type = typename VectTypeTrait::vect_type; + static const size_t packed_size = VectTypeTrait::packed_size; + bool try_vect_load_store() { + auto try_last_contig_or_scalar = [](const TensorLayout& layout) { + return (layout.stride[layout.ndim - 1] == 1 && + layout[layout.ndim - 1] % packed_size == 0) || + (layout.ndim == 1 && layout.stride[0] == 0); + }; + for (int i = 0; i < 2; ++i) { + if (!try_last_contig_or_scalar(m_param[i].layout)) + return false; } + m_rw_size /= packed_size; + dispatch0_vect(); + return true; + } - bool try_vect_load_store_contiguous() { - auto try_contig = [](const TensorLayout& layout) { - return (layout.is_contiguous()); - }; - for (int i = 0; i < 2; ++i) { - if (!try_contig(m_param[i].layout)) - return false; - } - m_rw_size = DIVUP(m_rw_size, packed_size); - dispatch_contiguous(); - return true; + bool try_vect_load_store_contiguous() { + auto try_contig = [](const TensorLayout& layout) { + return (layout.is_contiguous()); + }; + for (int i = 0; i < 2; ++i) { + if (!try_contig(m_param[i].layout)) + return false; } + m_rw_size = DIVUP(m_rw_size, packed_size); + dispatch_contiguous(); + return true; + } - void dispatch0() { - switch (m_param[0].layout.ndim) { + void dispatch0() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } - void dispatch0_vect() { - switch (m_param[0].layout.ndim) { + void dispatch0_vect() { + switch (m_param[0].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch1_vect_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - case 4: - return dispatch1_vect_4(); - } - fallback(); + case 4: + return dispatch1_vect_4(); } + fallback(); + } - void dispatch_contiguous() { - m_invoked = true; - typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; - typedef OpCallerUniform Caller; - size_t size = m_rw_size; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - - Caller caller; - caller.op = m_op; - for (int i = 0; i < 2; ++i) - caller.par[i].host_init(m_param[i], grid_size, block_size); - (*fptr)<<>>(caller, - m_param.size); - after_kernel_launch(); - } + void dispatch_contiguous() { + m_invoked = true; + typedef ParamVectVisitor<1, ctype, BCAST_OTHER> PVis; + typedef OpCallerUniform Caller; + size_t size = m_rw_size; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + + Caller caller; + caller.op = m_op; + for (int i = 0; i < 2; ++i) + caller.par[i].host_init(m_param[i], grid_size, 
block_size); + (*fptr)<<>>(caller, m_param.size); + after_kernel_launch(); + } - void fallback() { - megdnn_assert(!m_invoked); - UserOpInvokerToSameNdim(m_param, m_stream, m_op); - m_invoked = true; - } + void fallback() { + megdnn_assert(!m_invoked); + UserOpInvokerToSameNdim(m_param, m_stream, m_op); + m_invoked = true; + } #define cb_header(ndim) void dispatch1_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ dispatch2>() - DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch #define cb_header(ndim) void dispatch1_vect_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ dispatch2_vect>() - DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[0].layout.stride) + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[0].layout.stride) #undef cb_header #undef cb_dispatch - template - void dispatch2() { - switch (m_param[1].layout.ndim) { + template + void dispatch2() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch3_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - } - fallback(); } + fallback(); + } - template - void dispatch2_vect() { - switch (m_param[1].layout.ndim) { + template + void dispatch2_vect() { + switch (m_param[1].layout.ndim) { #define cb(ndim) \ case ndim: \ return dispatch3_vect_##ndim(); - MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) + MEGDNN_FOREACH_TENSOR_NDIM_SMALL(cb) #undef cb - case 4: - return dispatch3_vect_4(); - } - fallback(); + case 4: + return dispatch3_vect_4(); } + fallback(); + } #define cb_header(ndim) \ template \ void dispatch3_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ do_run>() - DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch @@ -1152,43 +1136,43 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, void dispatch3_vect_##ndim() #define cb_dispatch(ndim, brdcast_mask) \ do_run>() - DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, - m_param[1].layout.stride) + DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, + m_param[1].layout.stride) #undef cb_header #undef cb_dispatch - template - void do_run() { - megdnn_assert(!m_invoked); - m_invoked = true; - typedef OpCallerBinary Caller; - int grid_size, block_size; - void (*fptr)(Caller, uint32_t) = cuda_kern; - size_t size = m_rw_size; - get_launch_spec(reinterpret_cast(fptr), size, - &grid_size, &block_size); - Caller caller; - caller.op = m_op; - caller.par0.host_init(m_param[0], grid_size, block_size); - caller.par1.host_init(m_param[1], grid_size, block_size); - (*fptr)<<>>(caller, size); - after_kernel_launch(); - } + template + void do_run() { + megdnn_assert(!m_invoked); + m_invoked = true; + typedef OpCallerBinary Caller; + int grid_size, block_size; + void (*fptr)(Caller, uint32_t) = cuda_kern; + size_t size = m_rw_size; + get_launch_spec(reinterpret_cast(fptr), size, &grid_size, + &block_size); + Caller caller; + caller.op = m_op; + caller.par0.host_init(m_param[0], grid_size, block_size); + caller.par1.host_init(m_param[1], grid_size, block_size); + (*fptr)<<>>(caller, size); + after_kernel_launch(); + } - public: - UserOpInvokerBinaryIByteHelper(const ElemwiseOpParamN<2>& param, - cudaStream_t stream, const Op& op) - : 
m_rw_size(param.size), - m_param(param), - m_stream(stream), - m_op(op) { - m_invoked = false; - if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { - dispatch0(); - } - megdnn_assert(m_invoked); +public: + UserOpInvokerBinaryIByteHelper(const ElemwiseOpParamN<2>& param, + cudaStream_t stream, const Op& op) + : m_rw_size(param.size), + m_param(param), + m_stream(stream), + m_op(op) { + m_invoked = false; + if (!try_vect_load_store_contiguous() && !try_vect_load_store()) { + dispatch0(); } - }; + megdnn_assert(m_invoked); + } +}; #define INST_DT_IBYTE(ctype) \ template \ @@ -1201,20 +1185,20 @@ DEFINE_BRDCAST_DISPATCH_RECEIVERS(cb_header, cb_dispatch, const Op& op) \ : Super{param, stream, op} {} \ } - INST_DT_IBYTE(dt_int8); - INST_DT_IBYTE(dt_uint8); - INST_DT_IBYTE(dt_qint8); - INST_DT_IBYTE(dt_quint8); +INST_DT_IBYTE(dt_int8); +INST_DT_IBYTE(dt_uint8); +INST_DT_IBYTE(dt_qint8); +INST_DT_IBYTE(dt_quint8); #undef INST_DT_IBYTE #endif #undef DEFINE_BRDCAST_DISPATCH_RECEIVERS #undef DEFINE_VECT_BRDCAST_DISPATCH_RECEIVERS - /* f}}} */ +/* f}}} */ #undef devfunc -} // namespace elemwise_intl +} // namespace elemwise_intl /*! * \brief general element-wise kernel launcher @@ -1236,10 +1220,9 @@ void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, const Op& op = Op()); #if MEGDNN_CC_CUDA -template -void run_elemwise( - const ElemwiseOpParamN ¶m, cudaStream_t stream, - const Op &op) { +template +void run_elemwise(const ElemwiseOpParamN& param, cudaStream_t stream, + const Op& op) { param.assert_initialized(); elemwise_intl::UserOpInvoker(param, stream, op); } @@ -1248,15 +1231,13 @@ void run_elemwise( * \brief explicit instantialization of run_elemwise for given template params; * used in .cu files, so corresponding run_elemwise can be called from .cpp */ -#define INST_RUN_ELEMWISE(Op, ctype, arity) \ -template void run_elemwise( \ - const ElemwiseOpParamN&, cudaStream_t, const Op&) +#define INST_RUN_ELEMWISE(Op, ctype, arity) \ + template void run_elemwise( \ + const ElemwiseOpParamN&, cudaStream_t, const Op&) #endif -} // namespace cuda -} // namespace megdnn +} // namespace cuda +} // namespace megdnn // vim: ft=cpp syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} - - diff --git a/dnn/test/cuda/elemwise.cpp b/dnn/test/cuda/elemwise.cpp index 1ae79d3b0a669cafa0b01bd6078079e5dc67a9cf..d74afa37dd6ab46b88609e667f64488e9e218de5 100644 --- a/dnn/test/cuda/elemwise.cpp +++ b/dnn/test/cuda/elemwise.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. 
*/ #include "test/common/elemwise.h" @@ -26,66 +27,61 @@ using namespace test; #define cudnn_check(e) megdnn_assert((e) == CUDNN_STATUS_SUCCESS) namespace { - __attribute__((unused)) - cudnnTensorDescriptor_t make_cudnn_tensor_desc(const TensorLayout &ly) { - megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous()); - int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1}; - for (size_t i = 0; i < ly.ndim; ++ i) { - dim[i] = ly.shape[i]; - stride[i] = ly.stride[i]; - } - cudnnTensorDescriptor_t ret; - cudnn_check(cudnnCreateTensorDescriptor(&ret)); - // cudnn requires tensors to be at-least 4D - cudnn_check(cudnnSetTensor4dDescriptorEx(ret, - CUDNN_DATA_FLOAT, - dim[0], dim[1], dim[2], dim[3], - stride[0], stride[1], stride[2], stride[3])); - - return ret; +__attribute__((unused)) cudnnTensorDescriptor_t make_cudnn_tensor_desc( + const TensorLayout& ly) { + megdnn_assert(ly.ndim && ly.ndim <= 4 && ly.is_contiguous()); + int dim[4] = {1, 1, 1, 1}, stride[4] = {1, 1, 1, 1}; + for (size_t i = 0; i < ly.ndim; ++i) { + dim[i] = ly.shape[i]; + stride[i] = ly.stride[i]; } + cudnnTensorDescriptor_t ret; + cudnn_check(cudnnCreateTensorDescriptor(&ret)); + // cudnn requires tensors to be at-least 4D + cudnn_check(cudnnSetTensor4dDescriptorEx(ret, CUDNN_DATA_FLOAT, dim[0], + dim[1], dim[2], dim[3], stride[0], + stride[1], stride[2], stride[3])); + + return ret; +} - void run_tensor_add( - Handle *handle_cuda, - const TensorND &a, const TensorND &b, - const TensorND &c) { +void run_tensor_add(Handle* handle_cuda, const TensorND& a, const TensorND& b, + const TensorND& c) { #if 1 - cudnnHandle_t cudnn_handle; - cudnn_check(cudnnCreate(&cudnn_handle)); - cuda_check(cudaDeviceSynchronize()); - cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(), - cudaMemcpyDeviceToDevice)); - - auto bdesc = make_cudnn_tensor_desc(b.layout), - cdesc = make_cudnn_tensor_desc(c.layout); - - float alpha = 1, beta = 1; - cudaProfilerStart(); - cudnn_check(cudnnAddTensor(cudnn_handle, - &alpha, bdesc, b.raw_ptr, - &beta, cdesc, c.raw_ptr)); - cudaProfilerStop(); - - cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); - cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); - cudnn_check(cudnnDestroy(cudnn_handle)); - - cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte())); - cuda_check(cudaDeviceSynchronize()); + cudnnHandle_t cudnn_handle; + cudnn_check(cudnnCreate(&cudnn_handle)); + cuda_check(cudaDeviceSynchronize()); + cuda_check(cudaMemcpy(c.raw_ptr, a.raw_ptr, a.layout.span().dist_byte(), + cudaMemcpyDeviceToDevice)); + + auto bdesc = make_cudnn_tensor_desc(b.layout), + cdesc = make_cudnn_tensor_desc(c.layout); + + float alpha = 1, beta = 1; + cudaProfilerStart(); + cudnn_check(cudnnAddTensor(cudnn_handle, &alpha, bdesc, b.raw_ptr, &beta, + cdesc, c.raw_ptr)); + cudaProfilerStop(); + + cudnn_check(cudnnDestroyTensorDescriptor(cdesc)); + cudnn_check(cudnnDestroyTensorDescriptor(bdesc)); + cudnn_check(cudnnDestroy(cudnn_handle)); + + cuda_check(cudaMemset(c.raw_ptr, 0, c.layout.span().dist_byte())); + cuda_check(cudaDeviceSynchronize()); #endif - auto opr = handle_cuda->create_operator(); - opr->param().mode = ElemwiseForward::Mode::ADD; - cudaProfilerStart(); - opr->exec({a, b}, c); - cudaProfilerStop(); - } + auto opr = handle_cuda->create_operator(); + opr->param().mode = ElemwiseForward::Mode::ADD; + cudaProfilerStart(); + opr->exec({a, b}, c); + cudaProfilerStop(); +} -} // anonymous namespace +} // anonymous namespace -template -class CUDA_ELEMWISE: public CUDA { -}; +template +class 
CUDA_ELEMWISE : public CUDA {}; TYPED_TEST_CASE(CUDA_ELEMWISE, elemwise::test_types); TYPED_TEST(CUDA_ELEMWISE, run) { elemwise::run_test(this->handle_cuda()); @@ -275,18 +271,17 @@ TEST_F(CUDA, ELEMWISE_BFLOAT16) { //! the memory of this test case is too large, sometimes will fail on tx1 TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) { - constexpr size_t A = 256 * 1024 * 64, - S0 = 16, S1 = 256, S2 = 64, S3 = 64; + constexpr size_t A = 256 * 1024 * 64, S0 = 16, S1 = 256, S2 = 64, S3 = 64; static_assert(A == S0 * S1 * S2 * S3, "bad value"); - SyncedTensor<> - t0(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); + SyncedTensor<> t0(handle_cuda(), + {TensorShape{S0, S1, S2, S3}, dtype::Float32()}), + t1(handle_cuda(), {TensorShape{S0, S1, S2, S3}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t0.tensornd_dev(), t1.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t0.tensornd_dev(), + t1.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(); - for (size_t i = 0; i < A; ++ i) { + for (size_t i = 0; i < A; ++i) { ASSERT_EQ(p0[i] + p0[i], p1[i]) << "at index " << i << "/" << A; } } @@ -294,19 +289,19 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_DENSE) { #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) { constexpr size_t A = 511, B = 509, C0 = 23, C1 = 23, C = C0 * C1; - SyncedTensor<> - t0(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); + SyncedTensor<> t0(handle_cuda(), + {TensorShape{A, B, C0, C1}, dtype::Float32()}), + t1(handle_cuda(), {TensorShape{1, B, 1, 1}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{A, B, C0, C1}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { - for (size_t k = 0; k < C; ++ k) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { + for (size_t k = 0; k < C; ++k) { auto off = i * B * C + j * C + k; ASSERT_EQ(p0[off] + p1[j], p2[off]); } @@ -317,16 +312,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_101) { TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}); + t1(handle_cuda(), {TensorShape{1, B}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[j], p2[off]); } @@ 
-336,16 +331,16 @@ TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_10) { TEST_F(CUDA, ELEMWISE_BENCHMARK_BCAST_01) { constexpr size_t A = 11583, B = 11587; SyncedTensor<> t0(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}), - t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}), - t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}); + t1(handle_cuda(), {TensorShape{1, A, 1}, dtype::Float32()}), + t2(handle_cuda(), {TensorShape{1, A, B}, dtype::Float32()}); UniformFloatRNG rng{-2.f, 2.f}; rng.gen(t0.tensornd_host()); rng.gen(t1.tensornd_host()); - run_tensor_add(handle_cuda(), - t0.tensornd_dev(), t1.tensornd_dev(), t2.tensornd_dev()); + run_tensor_add(handle_cuda(), t0.tensornd_dev(), t1.tensornd_dev(), + t2.tensornd_dev()); auto p0 = t0.ptr_host(), p1 = t1.ptr_host(), p2 = t2.ptr_host(); - for (size_t i = 0; i < A; ++ i) { - for (size_t j = 0; j < B; ++ j) { + for (size_t i = 0; i < A; ++i) { + for (size_t j = 0; j < B; ++j) { auto off = i * B + j; ASSERT_EQ(p0[off] + p1[i], p2[off]); } @@ -361,8 +356,9 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) { .set_param(Mode::FUSE_ADD_RELU) .set_dtype(0, dtype::Int8()) .set_dtype(1, dtype::Int8()); - auto time = bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) / - nr_times; + auto time = + bencher.execs({{N * C * H * W + 1}, {N * C * H * W + 1}, {}}) / + nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (3.0 * (N * C * H * W + 1)) / (time * 1e6)); time = bencher.execs({{N, C / 4, H, W, 4}, {N, C / 4, H, W, 4}, {}}) / @@ -395,7 +391,6 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_IBYTE) { nr_times; printf("time = %.2fms, bandwidth = %.2fGB/s\n", time, (C + 2.0 * N * C * H * W) / (time * 1e6)); - }; run_bench(256, 256, 56, 56); } @@ -428,4 +423,3 @@ TEST_F(CUDA, BENCHMARK_ELEMWISE_MIN_MAX) { #endif // vim: syntax=cpp.doxygen -
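Reviewer note, not part of the patch: the trailing comment block in `elemwise_helper.cuh` states that `run_elemwise` is explicitly instantiated in `.cu` files through `INST_RUN_ELEMWISE` so the corresponding symbol can be called from `.cpp`. Below is a minimal sketch of that usage for orientation only. `HalveOp` and `halve_fwd` are hypothetical names introduced here; only `run_elemwise`, `ElemwiseOpParamN`, `dt_float32` and `INST_RUN_ELEMWISE` come from the code touched by this diff, and the `operator()` signature follows how `OpCallerUniform<Op, 1, PVis>` invokes `op(idx, par[0].at(idx))` for arity == 1.

// --- illustrative sketch only; not part of this patch ---
// hypothetical .cu file showing the intended use of run_elemwise /
// INST_RUN_ELEMWISE declared in elemwise_helper.cuh
#include "src/cuda/elemwise_helper.cuh"

namespace megdnn {
namespace cuda {

//! hypothetical unary op: writes dst[i] = src[i] * 0.5f
struct HalveOp {
    dt_float32* dst;  //!< contiguous output buffer supplied by the caller

    // signature matches OpCallerUniform<Op, 1, PVis>::on(), which passes the
    // flat logical index together with the element loaded by the visitor
    __device__ void operator()(uint32_t idx, dt_float32 src) {
        dst[idx] = src * 0.5f;
    }
};

// emitted from the .cu file so a .cpp caller can link against
// run_elemwise<HalveOp, dt_float32, 1>
INST_RUN_ELEMWISE(HalveOp, dt_float32, 1);

// hypothetical launcher: param describes the single input tensor; broadcast
// handling, ndim dispatch and launch configuration all happen inside
// run_elemwise via UserOpInvoker / get_launch_spec
void halve_fwd(const ElemwiseOpParamN<1>& param, dt_float32* dst,
               cudaStream_t stream) {
    HalveOp op;
    op.dst = dst;
    run_elemwise<HalveOp, dt_float32, 1>(param, stream, op);
}

}  // namespace cuda
}  // namespace megdnn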