From 255bf609e5d9289dfc6d5122e7fda746c933b6e2 Mon Sep 17 00:00:00 2001
From: niuliling123 <51102941+niuliling123@users.noreply.github.com>
Date: Tue, 1 Mar 2022 18:48:02 +0800
Subject: [PATCH] Add function description for Kernel Primitive API (#39884)

* Add function description for Kernel Primitive API

1. Set the cumsum and sort shared memory size to 1024.
2. sort and cumsum API limitation: blockDim.x must not exceed 512
   (blockDim.x <= 512).
---
 .../kernels/primitive/compute_primitives.h      | 284 +++++++++++++-----
 .../primitive/compute_primitives_xpu2.h         |  23 ++
 .../kernels/primitive/datamover_primitives.h    |  32 ++
 .../primitive/datamover_primitives_xpu2.h       |  41 +++
 4 files changed, 311 insertions(+), 69 deletions(-)

diff --git a/paddle/phi/kernels/primitive/compute_primitives.h b/paddle/phi/kernels/primitive/compute_primitives.h
index 4f3c069f3b2..19427551fb3 100644
--- a/paddle/phi/kernels/primitive/compute_primitives.h
+++ b/paddle/phi/kernels/primitive/compute_primitives.h
@@ -136,7 +136,9 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
   return shared_memory[threadIdx.x];
 }
 
-// Swap data
+/**
+ * @brief Swap data.
+ */
 template <typename T>
 __device__ __forceinline__ void Swap(T* first_value, T* second_value) {
   T t_value;
@@ -145,7 +147,9 @@ __device__ __forceinline__ void Swap(T* first_value, T* second_value) {
   (*second_value) = t_value;
 }
 
-// swap with monotonic_type
+/**
+ * @brief Swap data according to monotonic_type.
+ */
 template <typename T>
 __device__ __forceinline__ void Comparator(T* first_value,
                                            T* second_value,
@@ -155,6 +159,9 @@ __device__ __forceinline__ void Comparator(T* first_value,
   }
 }
 
+/**
+ * @brief Swap data and data index according to monotonic_type.
+ */
 template <typename T, typename IndexType>
 __device__ __forceinline__ void ComparatorWithIndex(T* first_value,
@@ -170,6 +177,18 @@ __device__ __forceinline__ void ComparatorWithIndex(T* first_value,
   }
 }
 
+/**
+ * @brief Get the largest power of 2 that is no larger than n.
+ */
+__device__ inline int GetLastPow2(int n) {
+  n |= (n >> 1);
+  n |= (n >> 2);
+  n |= (n >> 4);
+  n |= (n >> 8);
+  n |= (n >> 16);
+  return std::max(1, n - (n >> 1));
+}
+
 }  // namespace details
 
 /**
@@ -453,6 +472,29 @@ __device__ __forceinline__ void Reduce(T* out,
   }
 }
 
+/*
+* @brief Fill the register with a constant according to OpFunc.
+*
+* @template parameters
+* InT: The data type of the input.
+* OutT: The data type of out.
+* NX: The number of data columns loaded by each thread.
+* NY: The number of data rows loaded by each thread.
+* BlockSize: Identifies the current device thread index method. Currently
+* only GPU is supported.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()()
+*           const {
+*         return a;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc().
+*/
 template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {

+/*
+* @brief Fill the register with ReturnsCount random numbers computed from
+* state; state can be curandStatePhilox4_32_10_t or
+* hiprandStatePhilox4_32_10_t and must already be initialized.
+*
+* @template parameters
+* StateType: The type of state, e.g. curandStatePhilox4_32_10_t.
+* OutT: The data type of out.
+* ReturnsCount: The number of random numbers returned per call of compute.
+* BlockSize: Identifies the current device thread index method. Currently
+* only GPU is supported.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename T>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()(StateType state)
+*           const {
+*         return random(state);  // Returns ReturnsCount random numbers with
+*                                // data type T
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc().
+*/
 template <typename StateType,
           typename OutT,
           int ReturnsCount,
           int BlockSize,
           class OpFunc>
 __device__ __forceinline__ void ElementwiseRandom(OutT* out,
                                                   OpFunc compute,
                                                   StateType* state) {
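To make the documented OpFunc contract for ElementwiseConstant concrete, here is a minimal sketch; the functor name SetValueFunctor and the call site are illustrative assumptions, not part of this patch (it also assumes the usual alias namespace kps = phi::kps):

    // Hypothetical constant functor: operator()() takes no arguments and
    // returns the fill value, as the comment block above requires.
    template <typename InT>
    struct SetValueFunctor {
      InT value;
      HOSTDEVICE explicit SetValueFunctor(InT v) : value(v) {}
      HOSTDEVICE InT operator()() const { return value; }
    };

    // Inside a kernel, each thread fills its NX * NY = 4 registers with 0:
    //   float out[4];
    //   kps::ElementwiseConstant<float, float, 4, 1, 1>(
    //       out, SetValueFunctor<float>(0.0f));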
+/*
+* @brief Compute the prefix sum within the block. Each thread handles 2
+* elements, the size of out and in is 2, and blockDim.x must not exceed 512
+* (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* BlockSize: Identifies the current device thread index method. Currently
+* only GPU is supported.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()(InT a, InT b)
+*           const {
+*         return a + b;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* in: The register pointer of input, the size is 2.
+* compute: Compute function which was declared like OpFunc().
+*/
-template <typename InT, typename OutT, int shared_size, class OpFunc>
+
+#define SHARED_SIZE_LIMIT 512
+template <typename InT, typename OutT, int BlockSize, class OpFunc>
 __device__ __forceinline__ void Cumsum(OutT* out, const InT* in, OpFunc compute) {
-  __shared__ InT temp[shared_size * 2 + (shared_size * 2) / 32];
+  constexpr int kSize = SHARED_SIZE_LIMIT * 2 + (SHARED_SIZE_LIMIT * 2) / 32;
+  __shared__ InT temp[kSize];
+  int stride_size = blockDim.x;
   int tidx = threadIdx.x;
   temp[tidx + tidx / 32] = in[0];
-  temp[shared_size + tidx + (shared_size + tidx) / 32] = in[1];
-  for (int stride = 1; stride <= blockDim.x; stride *= 2) {
+  temp[stride_size + tidx + (stride_size + tidx) / 32] = in[1];
+  for (int stride = 1; stride <= stride_size; stride *= 2) {
     __syncthreads();
     int index = (tidx + 1) * 2 * stride - 1;
     if (index < (blockDim.x * 2)) {
-      temp[index + index / 32] += temp[index - stride + (index - stride) / 32];
+      temp[index + index / 32] =
+          compute(temp[index + index / 32],
+                  temp[index - stride + (index - stride) / 32]);
     }
   }
   for (int stride = (blockDim.x * 2) / 4; stride > 0; stride /= 2) {
     __syncthreads();
     int index = (tidx + 1) * 2 * stride - 1;
     if ((index + stride) < (blockDim.x * 2)) {
-      temp[index + stride + (stride + index) / 32] +=
-          temp[index + (index) / 32];
+      temp[index + stride + (stride + index) / 32] =
+          compute(temp[index + stride + (stride + index) / 32],
+                  temp[index + (index) / 32]);
     }
   }
   __syncthreads();
   out[0] = static_cast<OutT>(temp[tidx + tidx / 32]);
   out[1] =
-      static_cast<OutT>(temp[tidx + shared_size + (tidx + shared_size) / 32]);
+      static_cast<OutT>(temp[tidx + stride_size + (tidx + stride_size) / 32]);
 }
-
-#define SHARED_SIZE_LIMIT \
-  1024  // each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
-        // larger than blockDim.x * 2
-// if monotonic_type = 1 then increase
-// if gridDim.x > 1 please set monotonic_type = blockIdx.x & 1; blockIdx.x % 2
-// == 1 the increase
-template <typename T>
-__device__ __forceinline__ void Sort(T* dst,
-                                     const T* src_data,
+#undef SHARED_SIZE_LIMIT
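A usage sketch for the new Cumsum contract (illustrative, not part of the patch): the AddFunctor below matches the documented OpFunc shape, and the data layout follows the shared-memory indexing above, where thread i holds tile elements i and i + blockDim.x.

    // Hypothetical additive OpFunc and caller; names are illustrative, and
    // namespace kps = phi::kps is assumed.
    template <typename T>
    struct AddFunctor {
      HOSTDEVICE T operator()(T a, T b) const { return a + b; }
    };

    __global__ void BlockCumsumKernel(float* dst, const float* src) {
      float in[2], out[2];
      in[0] = src[threadIdx.x];               // first half of the block tile
      in[1] = src[threadIdx.x + blockDim.x];  // second half of the block tile
      kps::Cumsum<float, float, 1>(out, in, AddFunctor<float>());
      dst[threadIdx.x] = out[0];
      dst[threadIdx.x + blockDim.x] = out[1];
    }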
+/*
+* @brief Sort data in this block. Each thread handles 2 elements, the size
+* of out and in is 2, and blockDim.x must not exceed 512 (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* BlockSize: Identifies the current device thread index method. Currently
+* only GPU is supported.
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* in: The register pointer of input, the size is 2.
+* num: The number of elements to sort in this block.
+* monotonic_type: if monotonic_type = 1 the data is sorted in ascending
+* order, otherwise in descending order.
+*/
+#define SHARED_SIZE_LIMIT 1024
+// Each thread loads 2 elements from global memory, so SHARED_SIZE_LIMIT must
+// be larger than blockDim.x * 2.
+template <typename InT, typename OutT, int BlockSize>
+__device__ __forceinline__ void Sort(OutT* out,
+                                     const InT* in,
                                      int num,
                                      int monotonic_type) {
-  // todo: set num = Pow2(num)
+  int upper_bound = blockDim.x;
+  // Round num down to the last power of 2, capped at blockDim.x.
+  upper_bound = std::min(details::GetLastPow2(num), upper_bound);
   // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2
-  __shared__ T value[SHARED_SIZE_LIMIT];  // shareMem's size must larger than
-                                          // blockDim * 2
-  // Copy value and index from src and src_index
-  value[threadIdx.x] = src_data[0];
-  value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1];
+  __shared__ InT value[SHARED_SIZE_LIMIT];
+  int stride_size = blockDim.x;
+  // The shared memory size must be larger than blockDim.x * 2.
+  // Copy value from in.
+  value[threadIdx.x] = in[0];
+  value[threadIdx.x + stride_size] = in[1];
   // make bitonicSort
-  for (int size = 2; size < num; size <<= 1) {
+  for (int size = 2; size < upper_bound; size <<= 1) {
     int bitonic_type = (threadIdx.x & (size / 2)) != 0;
     for (int stride = size / 2; stride > 0; stride >>= 1) {
       __syncthreads();
       int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      details::Comparator(&value[pos], &value[pos + stride], bitonic_type);
+      details::Comparator<InT>(&value[pos], &value[pos + stride], bitonic_type);
     }
   }
   // last sort
-  for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {
+  for (int stride = stride_size; stride > 0; stride >>= 1) {
     __syncthreads();
     int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
     // last sort when monotonic_type = 1 then increase
-    details::Comparator(&value[pos], &value[pos + stride], monotonic_type);
+    details::Comparator<InT>(&value[pos], &value[pos + stride], monotonic_type);
   }
   __syncthreads();
-  dst[0] = value[threadIdx.x];
-  dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  out[0] = static_cast<OutT>(value[threadIdx.x]);
+  out[1] = static_cast<OutT>(value[threadIdx.x + stride_size]);
 }
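A usage sketch for the value-only Sort (illustrative, not part of the patch): one block sorts a tile of 2 * blockDim.x floats in ascending order; thread i supplies tile elements i and i + blockDim.x, matching the shared-memory layout above.

    // Hypothetical caller; the kernel name and data layout are assumptions,
    // and namespace kps = phi::kps is assumed.
    __global__ void BlockSortKernel(float* dst, const float* src, int num) {
      float in[2], out[2];
      in[0] = src[threadIdx.x];
      in[1] = src[threadIdx.x + blockDim.x];
      // monotonic_type = 1 sorts ascending; requires blockDim.x <= 512.
      kps::Sort<float, float, 1>(out, in, num, 1);
      dst[threadIdx.x] = out[0];
      dst[threadIdx.x + blockDim.x] = out[1];
    }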
-template <typename T, typename IndexType>
-__device__ __forceinline__ void Sort(T* dst,
-                                     IndexType* dst_index,
-                                     const T* src_data,
-                                     IndexType* src_index,
+/*
+* @brief Sort data with data_index in this block. Each thread handles 2
+* elements, the size of out and in is 2, and blockDim.x must not exceed 512
+* (blockDim.x <= 512).
+*
+* @template parameters
+* InT: The type of the input register.
+* OutT: The type of the out register.
+* IndexType: The type of index.
+* BlockSize: Identifies the current device thread index method. Currently
+* only GPU is supported.
+*
+* @param
+* out: The register pointer of out, the size is 2.
+* out_index: The register pointer of out_index, the size is 2.
+* in: The register pointer of input, the size is 2.
+* in_index: The register pointer of in_index, the size is 2.
+* num: The number of elements to sort in this block.
+* monotonic_type: if monotonic_type = 1 the data is sorted in ascending
+* order, otherwise in descending order.
+*/
+template <typename InT, typename OutT, typename IndexType, int BlockSize>
+__device__ __forceinline__ void Sort(OutT* out,
+                                     IndexType* out_index,
+                                     const InT* in,
+                                     IndexType* in_index,
                                      int num,
                                      int monotonic_type) {
-  // todo: set num = Pow2(num)
+  int upper_bound = blockDim.x;
+  // Round num down to the last power of 2, capped at blockDim.x.
+  upper_bound = std::min(details::GetLastPow2(num), upper_bound);
   // shareMem for value and index num must smaller than SHARED_SIZE_LIMIT / 2
-  __shared__ T value[SHARED_SIZE_LIMIT];  // shareMem's size must larger than
-                                          // blockDim * 2
+  __shared__ InT value[SHARED_SIZE_LIMIT];
+  // The shared memory size must be larger than blockDim.x * 2.
   __shared__ IndexType index[SHARED_SIZE_LIMIT];
-  // Copy value and index from src and src_index
-  value[threadIdx.x] = src_data[0];
-  value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_data[1];
+  // Copy value and index from in and in_index.
+  int stride_size = blockDim.x;
+  value[threadIdx.x] = in[0];
+  value[threadIdx.x + stride_size] = in[1];
   // index
-  index[threadIdx.x] = src_index[0];
-  index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)] = src_index[1];
+  index[threadIdx.x] = in_index[0];
+  index[threadIdx.x + stride_size] = in_index[1];
   // make bitonicSort
-  for (int size = 2; size < num; size <<= 1) {
+  for (int size = 2; size < upper_bound; size <<= 1) {
     int bitonic_type = (threadIdx.x & (size / 2)) != 0;
     for (int stride = size / 2; stride > 0; stride >>= 1) {
       __syncthreads();
       int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
-      details::ComparatorWithIndex(&value[pos],
-                                   &value[pos + stride],
-                                   &index[pos],
-                                   &index[pos + stride],
-                                   bitonic_type);
+      details::ComparatorWithIndex<InT, IndexType>(&value[pos],
+                                                   &value[pos + stride],
+                                                   &index[pos],
+                                                   &index[pos + stride],
+                                                   bitonic_type);
     }
   }
 
-  for (int stride = SHARED_SIZE_LIMIT / 2; stride > 0; stride >>= 1) {
+  for (int stride = stride_size; stride > 0; stride >>= 1) {
     __syncthreads();
     int pos = 2 * threadIdx.x - (threadIdx.x & (stride - 1));
     // last sort when monotonic_type = 1 then increase
-    details::ComparatorWithIndex(&value[pos],
-                                 &value[pos + stride],
-                                 &index[pos],
-                                 &index[pos + stride],
-                                 monotonic_type);
+    details::ComparatorWithIndex<InT, IndexType>(&value[pos],
+                                                 &value[pos + stride],
+                                                 &index[pos],
+                                                 &index[pos + stride],
+                                                 monotonic_type);
   }
   __syncthreads();
-  dst[0] = value[threadIdx.x];
-  dst[1] = value[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
-  dst_index[0] = index[threadIdx.x];
-  dst_index[1] = index[threadIdx.x + (SHARED_SIZE_LIMIT / 2)];
+  out[0] = static_cast<OutT>(value[threadIdx.x]);
+  out[1] = static_cast<OutT>(value[threadIdx.x + stride_size]);
+  out_index[0] = index[threadIdx.x];
+  out_index[1] = index[threadIdx.x + stride_size];
+}
+
+/**
+ * @brief Invoke func on the given registers: func(out, in1, in2, num).
+ */
+template <typename T1, typename T2, typename OutT, typename OpFunc>
+HOSTDEVICE __forceinline__ void OperatorTernary(
+    OutT* out, const T1* in1, const T2* in2, OpFunc func, int num) {
+  func(out, in1, in2, num);
+}
+
+/**
+ * @brief Invoke func on the given registers: func(out, in, num).
+ */
+template <typename InT, typename OutT, typename OpFunc>
+HOSTDEVICE __forceinline__ void OperatorBinary(OutT* out,
+                                               const InT* in,
+                                               OpFunc func,
+                                               int num) {
+  func(out, in, num);
+}
 
 }  // namespace kps
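A usage sketch for the indexed Sort overload (illustrative, not part of the patch), e.g. a block-level argsort: values travel with their indices, so i_out reports the original position of each sorted element.

    // Hypothetical caller; names and layout are assumptions, and
    // namespace kps = phi::kps is assumed.
    __global__ void BlockArgSortKernel(float* dst, int* dst_idx,
                                       const float* src, int num) {
      float v_in[2], v_out[2];
      int i_in[2], i_out[2];
      v_in[0] = src[threadIdx.x];
      v_in[1] = src[threadIdx.x + blockDim.x];
      i_in[0] = threadIdx.x;
      i_in[1] = threadIdx.x + blockDim.x;
      kps::Sort<float, float, int, 1>(v_out, i_out, v_in, i_in, num, 1);
      dst[threadIdx.x] = v_out[0];
      dst[threadIdx.x + blockDim.x] = v_out[1];
      dst_idx[threadIdx.x] = i_out[0];
      dst_idx[threadIdx.x + blockDim.x] = i_out[1];
    }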
diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
index a445f4a02ea..1f4ef2ed932 100644
--- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h
@@ -348,6 +348,29 @@ __device__ __forceinline__ void Reduce(T* out,
   }
 }
 
+/*
+* @brief Fill the register with a constant according to OpFunc.
+*
+* @template parameters
+* InT: The data type of the input.
+* OutT: The data type of out.
+* NX: The number of data columns loaded by each thread.
+* NY: The number of data rows loaded by each thread.
+* BlockSize: Identifies the current device thread index method. For XPU,
+* core_id() is used as the index.
+* OpFunc: Compute functor which has an operator() as follows:
+*     template <typename InT>
+*     struct XxxFunctor {
+*       HOSTDEVICE InT operator()()
+*           const {
+*         return a;
+*       }
+*     };
+*
+* @param
+* out: The register pointer of out, the size is NX * NY.
+* compute: Compute function which was declared like OpFunc().
+*/
 template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
 __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {

diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h
--- a/paddle/phi/kernels/primitive/datamover_primitives.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives.h
 /**
  * @brief Read 1D data from global memory to register. The difference
  * from the above function is that it supports different data types of inputs.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: Each thread loads NX data from global memory continuously.
+ * NY: Each thread needs to load NY rows; only NY = 1 is supported.
+ * ArgsT: The type of dst; ArgsT can be std::tuple<Args> or
+ * std::tuple<Args, int>.
+ * Index: The index of data stored in dst.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index. Currently only GPU is supported.
+ * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
+ * When the number of data processed by this block is less than
+ * NX x NY x blockDim.x, boundary judgment is required to avoid memory
+ * access crossing the boundary.
+ *
+ * @param:
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size: The current block needs to load size data continuously.
  */
 
+template <typename T, int NX, int NY, int BlockSize>
+__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
+  int thread_offset = block_offset + threadIdx.x * NX;
+#pragma unroll
+  for (int nx = 0; nx < NX; ++nx) {
+    dst[nx] = static_cast<T>(thread_offset + nx);
+  }
+}
+
diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
index 75b2dbaf7e6..53a8b7d0c9e 100644
--- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
+++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h
@@ -244,6 +244,24 @@ __device__ __inline__ void ReadData(T* dst,
 /**
  * @brief Read 1D data from global memory to register. The difference
  * from the above function is that it supports different data types of inputs.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: Each thread loads NX data from global memory continuously.
+ * NY: Each thread needs to load NY rows; only NY = 1 is supported.
+ * ArgsT: The type of dst; ArgsT can be std::tuple<Args> or
+ * std::tuple<Args, int>.
+ * Index: The index of data stored in dst.
+ * BlockSize: Identifies the current device thread index method. For XPU,
+ * core_id() is used as the index.
+ * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
+ * When the number of data processed by this block is less than
+ * NX x NY x blockDim.x, boundary judgment is required to avoid memory
+ * access crossing the boundary.
+ *
+ * @param:
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: The data pointer of the current block.
+ * size: The current block needs to load size data continuously.
  */
 
+template <typename T, int NX, int NY, int BlockSize>
+__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
+  int thread_offset = block_offset + core_id() * NX;
+#pragma unroll
+  for (int nx = 0; nx < NX; ++nx) {
+    dst[nx] = static_cast<T>(thread_offset + nx);
+  }
+}
+
 }  // namespace kps
 }  // namespace phi
-- 
GitLab
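For the InitWithDataIndex primitive added above, a usage sketch (illustrative, not part of the patch): each thread materializes the global indices of its elements in registers, which is how index-producing fills can be written on top of the primitive.

    // Hypothetical call inside a kernel; NX = 4, NY = 1, BlockSize = 1, and
    // namespace kps = phi::kps are assumptions.
    //   int64_t idx[4];
    //   kps::InitWithDataIndex<int64_t, 4, 1, 1>(idx, block_offset);
    //   // On GPU:  idx[k] == block_offset + threadIdx.x * 4 + k
    //   // On XPU2: idx[k] == block_offset + core_id() * 4 + k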