Unverified commit bc0df48b authored by niuliling123, committed by GitHub

[icafe-31094] Add function comments and instructions to the Primitive API (#35743)

Parent 372a1a75
...@@ -54,8 +54,8 @@ class MPTypeTrait<platform::float16> {
};
/**
 * @brief Used in BlockYReduce to get the index of reduce_num in shared
 * memory.
 */
__device__ __forceinline__ int SharedMemoryIndex(int index) {
  return (threadIdx.y + index) * blockDim.x + threadIdx.x;
...@@ -83,7 +83,7 @@ __device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) {
 */
/**
 * @brief BlockXReduce reduces along blockDim.x.
 */
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
...@@ -115,7 +115,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
}
/**
 * @brief BlockYReduce reduces along blockDim.y.
 */
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
...@@ -135,24 +135,33 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
}  // namespace details
/**
 * @brief Perform unary calculation according to OpFunc. Sizes of input and
 * output are the same.
 *
 * @template parameters
 * InT: Data type of in.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in: The register pointer of in, the size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                 OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; idx++) {
...@@ -161,25 +170,35 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
}
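A minimal usage sketch, not part of this diff: the functor follows the OpFunc form documented above, while the `kps` namespace alias, the helper name and the template values are assumptions made only for illustration.

template <typename InT, typename OutT>
struct ScaleFunctor {
  HOSTDEVICE OutT operator()(const InT& a) const {
    return static_cast<OutT>(a) * static_cast<OutT>(2);
  }
};

// Each thread already holds NX * NY input elements in registers (e.g. loaded
// by ReadData); the primitive applies the functor to every register element.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void ScaleTile(OutT* out, const InT* in) {
  kps::ElementwiseUnary<InT, OutT, NX, NY, BlockSize, ScaleFunctor<InT, OutT>>(
      out, in, ScaleFunctor<InT, OutT>());
}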
/**
 * @brief Binary calculation according to OpFunc. Sizes of the input and output
 * are the same.
 *
 * @template parameters
 * InT: Data type of in1 and in2.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * NY.
 * in2: The register pointer of the second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
                                                  const InT* in2,
                                                  OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
...@@ -188,25 +207,38 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
}
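A hedged sketch of the binary primitive; AddFunctor, the helper name and the `kps` alias are illustrative, not part of this change.

template <typename InT, typename OutT>
struct AddFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
    return static_cast<OutT>(a + b);
  }
};

// in1 and in2 are register tiles of identical size NX * NY.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void AddTile(OutT* out, const InT* in1, const InT* in2) {
  kps::ElementwiseBinary<InT, OutT, NX, NY, BlockSize, AddFunctor<InT, OutT>>(
      out, in1, in2, AddFunctor<InT, OutT>());
}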
/**
 * @brief Ternary calculation according to OpFunc. Sizes of input and output
 * are the same.
 *
 * @template parameters
 * InT: Data type of in1, in2 and in3.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c)
 *           const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * NY.
 * in2: The register pointer of the second input, size is NX * NY.
 * in3: The register pointer of the third input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
                                                   const InT* in2,
                                                   const InT* in3,
                                                   OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
...@@ -215,27 +247,36 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
}
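A usage sketch under the same assumptions as above (the `kps` alias and the functor name are illustrative): a fused multiply-add over three register tiles of equal size.

template <typename InT, typename OutT>
struct FmaFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) const {
    return static_cast<OutT>(a * b + c);
  }
};

template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void FmaTile(OutT* out, const InT* in1, const InT* in2,
                        const InT* in3) {
  kps::ElementwiseTernary<InT, OutT, NX, NY, BlockSize, FmaFunctor<InT, OutT>>(
      out, in1, in2, in3, FmaFunctor<InT, OutT>());
}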
/**
 * @brief Multivariate calculation according to OpFunc. Sizes of input and
 * output are the same.
 *
 * @template parameters
 * InT: Data type of the inputs in ins.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Arity: The number of inputs in ins.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT* args) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * ins: An array of Arity inputs, each a register array of size NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
          class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
                                               OpFunc compute) {
  InT args[Arity];
#pragma unroll
  for (int idx = 0; idx < NX * NY; ++idx) {
#pragma unroll
...@@ -247,20 +288,36 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY],
}
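A sketch of the variadic form, with the functor receiving one element from every input per call; names and the `kps` alias are assumptions for illustration.

template <typename InT, typename OutT, int Arity>
struct SumArgsFunctor {
  HOSTDEVICE OutT operator()(const InT* args) const {
    OutT total = static_cast<OutT>(0);
#pragma unroll
    for (int i = 0; i < Arity; ++i) {
      total += static_cast<OutT>(args[i]);
    }
    return total;
  }
};

// ins holds Arity register tiles of NX * NY elements each.
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity>
__device__ void SumTiles(OutT* out, InT (*ins)[NX * NY]) {
  kps::ElementwiseAny<InT, OutT, NX, NY, BlockSize, Arity,
                      SumArgsFunctor<InT, OutT, Arity>>(
      out, ins, SumArgsFunctor<InT, OutT, Arity>());
}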
/**
 * @brief Binary calculation according to OpFunc, where the shapes of in1 and
 * in2 differ. The shape of in1 is [1, NX], the shape of in2 is [NY, NX], and
 * the output shape is [NY, NX].
 *
 * @template parameters
 * InT: Data type of in1 and in2.
 * OutT: Data type of out.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * OpFunc: Compute functor which has an operator() as following:
 *     template <typename InT, typename OutT>
 *     struct XxxFunctor {
 *       HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in1: The register pointer of the first input, size is NX * 1.
 * in2: The register pointer of the second input, size is NX * NY.
 * compute: Compute function which was declared like OpFunc<InT, OutT>().
 */
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
          class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
                                            const InT* in2, OpFunc compute) {
#pragma unroll
  for (int idx = 0; idx < NX; idx++) {
#pragma unroll
...@@ -272,26 +329,37 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
}
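A hedged sketch of the cyclic form, for example adding a bias row to a register tile. A commutative functor is used so the sketch does not depend on which operand of the functor corresponds to in1; the names and the `kps` alias are illustrative.

template <typename InT, typename OutT>
struct BiasAddFunctor {
  HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
    return static_cast<OutT>(a + b);
  }
};

// bias_row holds NX elements and is reused for each of the NY rows in tile.
template <typename InT, typename OutT, int NX, int NY, int BlockSize>
__device__ void AddRowBias(OutT* out, const InT* bias_row, const InT* tile) {
  kps::CycleBinary<InT, OutT, NX, NY, BlockSize, BiasAddFunctor<InT, OutT>>(
      out, bias_row, tile, BiasAddFunctor<InT, OutT>());
}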
/**
 * @brief The Reduce provides collective methods for computing a parallel
 * reduction of items partitioned across a CUDA block and within each thread.
 * When ReduceMode == kLocalMode, each thread reduces along NX. When
 * ReduceMode == kGlobalMode, shared memory is used to reduce between threads.
 *
 * @template parameters
 * T: The type of data.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * ReduceFunctor: Compute functor which has an operator() as following:
 *     template <typename InT>
 *     struct ReduceFunctor {
 *       HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
 *         return ...;
 *       }
 *     };
 * ReduceMode: Reduce mode, can be kLocalMode or kGlobalMode.
 *
 * @param:
 * out: The register pointer of out, the size is NX * NY.
 * in: The register pointer of in, the size is NX * NY.
 * reducer: Compute function which was declared like ReduceFunctor<InT>().
 * reduce_last_dim: Whether the last dimension is involved in the reduction.
 */
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
          details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in,
                                       ReduceFunctor reducer,
                                       bool reduce_last_dim) {
  int block_index = blockDim.y;
...@@ -302,7 +370,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
    if (block_reduce_y) {
#pragma unroll
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.y
        out[i] = details::BlockYReduce<T, ReduceFunctor>(out[i], reducer);
      }
    }
...@@ -310,7 +378,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
    if (reduce_last_dim) {
#pragma unroll
      for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.x
        out[i] = details::BlockXReduce<T, ReduceFunctor>(out[i], reducer);
      }
    }
  } else {  // else kLocalMode
...
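A usage sketch of the two reduce modes: each thread first folds its NX registers into one partial result, then the block combines the partials through shared memory. The `kps` alias, the helper name and the exact spelling of the ReduceMode enum path are assumptions outside this diff.

template <typename T>
struct MaxFunctor {
  HOSTDEVICE T operator()(const T& a, const T& b) const {
    return a > b ? a : b;
  }
};

template <typename T, int NX, int BlockSize>
__device__ void BlockMax(T* result, const T* in, T identity,
                         bool reduce_last_dim) {
  kps::Init<T, 1>(result, identity);  // identity of Max, e.g. the lowest T
  kps::Reduce<T, NX, 1, BlockSize, MaxFunctor<T>,
              kps::details::ReduceMode::kLocalMode>(
      result, in, MaxFunctor<T>(), reduce_last_dim);
  kps::Reduce<T, 1, 1, BlockSize, MaxFunctor<T>,
              kps::details::ReduceMode::kGlobalMode>(
      result, result, MaxFunctor<T>(), reduce_last_dim);
}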
...@@ -32,7 +32,13 @@ template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VectorType {
  T val[VecSize];
};
/**
 * Fast division: replace division in CUDA with multiplication to improve
 * kernel performance.
 * 1. Complete the division calculation on the CPU, and record the calculation
 * results using the divider and shift_val.
 * 2. Set the divisor on the GPU through Div() to complete the calculation.
 */
struct FastDivMod {
  // 1st value represents the result of the input number divided by the recorded divisor
  // 2nd value represents the result of the input number modulo the recorded divisor
...@@ -71,6 +77,11 @@ struct FastDivMod {
  uint32_t multiplier;
};
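A usage sketch under assumptions: the divisor is fixed on the host when the struct is constructed, and on the device Divmod() returns the quotient in val[0] and the remainder in val[1], as described by the comments above and used by the ReadDataBc loop later in this diff. The helper name and signature are illustrative only.

__device__ uint32_t RowMajorOffset(FastDivMod divmod, uint32_t linear_index,
                                   uint32_t stride) {
  auto result = divmod.Divmod(linear_index);
  uint32_t quotient = result.val[0];   // linear_index / divisor
  uint32_t remainder = result.val[1];  // linear_index % divisor
  return quotient * stride + remainder;
}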
/**
 * Configuration of broadcast. Calculate the input data index according to the
 * index of the output data. If the input or output shape is [dim0, dim1], then
 * dims must be [dim1, dim0].
 */
template <int kDims>
struct BroadcastConfig {
  FastDivMod divmoders[kDims];
...@@ -107,65 +118,31 @@ struct BroadcastConfig {
}  // namespace details
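The plain host-side sketch below is not library code; it spells out the index mapping that BroadcastConfig encodes with its divmoders and strides. The function name and parameter layout are assumptions made for illustration.

#include <cstdint>
#include <vector>

// dst_dims lists the output dims innermost-first (hence the reversed
// [dim1, dim0] order noted above); src_strides is 0 for broadcast dims of
// size 1 in the input.
inline uint32_t BroadcastSrcIndex(uint32_t dst_index,
                                  const std::vector<uint32_t>& dst_dims,
                                  const std::vector<uint32_t>& src_strides) {
  uint32_t src_index = 0;
  for (size_t i = 0; i < dst_dims.size(); ++i) {
    uint32_t coord = dst_index % dst_dims[i];  // what Divmod returns in val[1]
    dst_index /= dst_dims[i];                  // ...and in val[0]
    src_index += coord * src_strides[i];
  }
  return src_index;
}

// Example: src shape [1, 35] broadcast to dst [32, 35] gives dst_dims = {35, 32}
// and src_strides = {1, 0}; every output row maps back to the single input row.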
/**
 * @brief Read 2D data from global memory to registers according to Tx type,
 * and store it as Ty type.
 *
 * @template parameters
 * Tx: The type of data stored in the global memory.
 * Ty: The type of data that needs to be stored in registers.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Data pointer of the current block.
 * size_nx: The current block needs to load size_nx columns of data; this
 * parameter will be used when IsBoundary = true.
 * size_ny: The current block needs to load size_ny rows of data; this
 * parameter will be used when IsBoundary = true.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
          bool IsBoundary = false>
...@@ -226,6 +203,17 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
  }
}
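A hedged sketch of the strided 2D load; the helper name, the `kps` alias and the exact argument order (taken from the @param list above) are assumptions, not confirmed by the truncated hunk.

template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
__device__ void LoadTile(Ty* dst, const Tx* src, int size_nx, int size_ny,
                         int stride_nx, int stride_ny, bool is_tail_block) {
  // size_nx / size_ny bound the tail tile and only matter when IsBoundary=true.
  if (is_tail_block) {
    kps::ReadData<Tx, Ty, NX, NY, BlockSize, true>(dst, src, size_nx, size_ny,
                                                   stride_nx, stride_ny);
  } else {
    kps::ReadData<Tx, Ty, NX, NY, BlockSize, false>(dst, src, size_nx, size_ny,
                                                    stride_nx, stride_ny);
  }
}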
/**
 * @brief Initialize register with init_data.
 *
 * @template parameters
 * T: Data type of register.
 * NX: Number of data to initialize.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX.
 * init_data: Initial value.
 */
template <typename T, int NX>
__device__ __forceinline__ void Init(T* dst, T init_data) {
#pragma unroll
...@@ -234,18 +222,27 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
  }
}
/**
 * @brief Read 1D data from global memory to registers. When IsBoundary = true
 * and (NX % 4 == 0 or NX % 2 == 0), vectorized loads are used to improve
 * memory access efficiency.
 *
 * @template parameters
 * T: Data type of src and dst.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Whether to make an out-of-bounds judgment on memory accesses.
 * When the number of data processed by this block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Data pointer of the current block.
 * size: The number of data elements the current block needs to load
 * continuously.
 */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
...@@ -279,28 +276,38 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
}
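A usage sketch of the contiguous load path, assuming the signature documented above; the helper name, the `kps` alias and the tail-block test are illustrative choices.

template <typename T, int NX, int BlockSize>
__device__ void LoadContiguous(T* dst, const T* src, int block_offset,
                               int total_num) {
  // Only the last, possibly partial, block takes the boundary-checked path,
  // so full blocks keep the vectorized fast path.
  int remaining = total_num - block_offset;
  if (remaining < NX * static_cast<int>(blockDim.x)) {
    kps::ReadData<T, NX, 1, BlockSize, true>(dst, src + block_offset,
                                             remaining);
  } else {
    kps::ReadData<T, NX, 1, BlockSize, false>(dst, src + block_offset,
                                              remaining);
  }
}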
/**
 * @brief Read 2D data from global memory to registers for broadcast.
 *
 * @template parameters
 * T: The type of data stored in the global memory.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Rank: The shape size of out, e.g. for in[1, 35] and out[32, 35] the shape
 * size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Raw input data pointer of the kernel.
 * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX.
 * config: Calculation configuration of broadcast. It is used to calculate the
 * coordinate mapping relationship between output data and input data. Please
 * refer to the sample code for specific usage.
 * total_num_output: Total number of original output elements.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 */
template <typename T, int NX, int NY, int BlockSize, int Rank,
          bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
    T* dst, const T* __restrict__ src, uint32_t block_offset,
    details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
    int stride_ny) {
  uint32_t thread_offset = block_offset + threadIdx.x * NX;
  uint32_t index_src = 0;
...@@ -316,7 +323,7 @@ __device__ __forceinline__ void ReadDataBc(
      }
    }
#pragma unroll
    for (int i = 0; i < Rank; ++i) {
      auto fast_divmoder = config.divmoders[i].Divmod(index_output);
      index_output = fast_divmoder.val[0];
      index_src += fast_divmoder.val[1] * config.strides[i];
...@@ -327,27 +334,41 @@ __device__ __forceinline__ void ReadDataBc(
}
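A hedged usage sketch of the broadcast load, mirroring the in[1, 35] / out[32, 35] example from the comment above with Rank = 2; the helper, the `kps` alias and the way `config` is produced on the host are assumptions.

template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
__device__ void LoadBroadcast(T* dst, const T* src,
                              kps::details::BroadcastConfig<2> config,
                              int total_num_output, int stride_nx,
                              int stride_ny) {
  // config maps each output index back to its source index via Divmod.
  uint32_t block_offset = blockIdx.x * blockDim.x * NX;
  kps::ReadDataBc<T, NX, NY, BlockSize, 2, IsBoundary>(
      dst, src, block_offset, config, total_num_output, stride_nx, stride_ny);
}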
/**
 * @brief Read 2D data from global memory to registers for reduce.
 *
 * @template parameters
 * T: The type of data stored in the global memory.
 * NX: The number of data columns loaded by each thread.
 * NY: The number of data rows loaded by each thread.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * Rank: The shape size of out, e.g. for in[1, 35] and out[32, 35] the shape
 * size is 2.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: The register pointer of the thread, the size is NX * NY.
 * src: Raw input data pointer of the kernel.
 * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX.
 * index_cal: Calculation configuration of Reduce. It is used to calculate the
 * coordinate mapping relationship between output data and input data. Please
 * refer to the sample code for specific usage.
 * size_nx: The current block needs to load size_nx columns of data; this
 * parameter will be used when IsBoundary = true.
 * size_ny: The current block needs to load size_ny rows of data; this
 * parameter will be used when IsBoundary = true.
 * stride_nx: The stride of cols.
 * stride_ny: The stride of rows.
 * reduce_last_dim: Indicates whether the reduce dimensions include the lowest
 * dimension.
 */
template <typename T, int NX, int NY, int BlockSize, int Rank,
          typename IndexCal, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(
    T* dst, const T* __restrict__ src, int block_offset,
...@@ -397,17 +418,26 @@ __device__ __forceinline__ void ReadDataReduce(
}
/**
 * @brief Write 1D data from registers to global memory. When IsBoundary = true
 * and (NX % 4 == 0 or NX % 2 == 0), vectorized stores are used to improve
 * memory access efficiency.
 *
 * @template parameters
 * T: The type of data.
 * NX: The number of data continuously loaded by each thread.
 * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
 * BlockSize: Identifies the current device thread index method. For GPU,
 * threadIdx.x is used as the thread index, and for xpu, core_id() is used as
 * the index. Currently only GPU is supported.
 * IsBoundary: Indicates whether to perform block access storage out-of-bounds
 * judgment. When the number of data processed by the block is less than
 * NX x NY x blockDim, boundary judgment is required to avoid memory access
 * crossing the boundary.
 *
 * @param:
 * dst: Data pointer of the current block.
 * src: The register pointer of the thread, the size is NX * NY.
 * size: The number of data elements the current block needs to store
 * continuously.
 */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
...
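To close the loop, an end-to-end sketch that chains ReadData, Init, ElementwiseUnary and WriteData into one elementwise kernel. The `kps` alias, the kernel and functor names, and the tail-block handling are assumptions for illustration, not part of this diff.

template <typename T>
struct SquareFunctor {
  HOSTDEVICE T operator()(const T& a) const { return a * a; }
};

template <typename T, int NX, int BlockSize>
__global__ void SquareKernel(const T* x, T* y, int total_num) {
  int block_offset = blockIdx.x * blockDim.x * NX;
  int remaining = total_num - block_offset;
  T in_reg[NX];
  T out_reg[NX];
  if (remaining < NX * static_cast<int>(blockDim.x)) {  // tail block
    kps::Init<T, NX>(in_reg, static_cast<T>(0));
    kps::ReadData<T, NX, 1, BlockSize, true>(in_reg, x + block_offset,
                                             remaining);
    kps::ElementwiseUnary<T, T, NX, 1, BlockSize, SquareFunctor<T>>(
        out_reg, in_reg, SquareFunctor<T>());
    kps::WriteData<T, NX, 1, BlockSize, true>(y + block_offset, out_reg,
                                              remaining);
  } else {
    kps::ReadData<T, NX, 1, BlockSize, false>(in_reg, x + block_offset,
                                              remaining);
    kps::ElementwiseUnary<T, T, NX, 1, BlockSize, SquareFunctor<T>>(
        out_reg, in_reg, SquareFunctor<T>());
    kps::WriteData<T, NX, 1, BlockSize, false>(y + block_offset, out_reg,
                                               remaining);
  }
}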