diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h
index 2898a11fd7a60165dbf3306045e2a0b304dd0f04..0c751a939b2147754c439f26d89df45d24f1d390 100644
--- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h
@@ -54,8 +54,8 @@ class MPTypeTrait {
 };
 /**
- * @brief will be used in BlockYReduce, get the index of reduce_num in shared
- * memory
+ * @brief Used in BlockYReduce to get the index of reduce_num in shared
+ * memory.
  */
 __device__ __forceinline__ int SharedMemoryIndex(int index) {
   return (threadIdx.y + index) * blockDim.x + threadIdx.x;
 }
@@ -83,7 +83,7 @@ __device__ __forceinline__ T WarpReduce(T val, ReduceOp reducer) {
  */
 /**
- * @brief BlockXReduce reduce along blockDim.x
+ * @brief BlockXReduce reduces along blockDim.x.
  */
 template <typename T, typename ReduceOp>
 __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
@@ -115,7 +115,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
 }
 /**
- * @brief BlockYReduce reduce along blockDim.y
+ * @brief BlockYReduce reduces along blockDim.y.
  */
 template <typename T, typename ReduceOp>
 __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
@@ -135,24 +135,33 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
 }  // namespace details
 /**
- * @brief unary function
- * @param
- * T: data type of in
- * OutT: data type of out
- * NX: the cols of in
- * NY: the rows of in
- * BlockSize: the config of this device
- * OpFunc: compute functor which have an operator() as following
- * template <typename T, typename OutT>
+ * @brief Perform unary calculation according to OpFunc. Sizes of the input
+ * and output are the same.
+ *
+ * @template parameters
+ * InT: Data type of in.
+ * OutT: Data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * OpFunc: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
 * struct XxxFunctor {
- * HOSTDEVICE OutT operator()(const T& a) const {
+ * HOSTDEVICE OutT operator()(const InT& a) const {
 * return ...;
 * }
 * };
+ *
+ * @param:
+ * out: The register pointer of out, the size is NX * NY.
+ * in: The register pointer of in, the size is NX * NY.
+ * compute: Compute function declared as OpFunc().
 */
-template <typename T, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
-__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
+template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
+__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
                                                  OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; idx++) {
@@ -161,25 +170,35 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in,
 }
 /**
- * @brief binary function, in1 and in2 have same shape
- * @param
- * T: data type of in1, in2
- * OutT: data type of out
- * NX: the cols of in1, in2
- * NY: the rows of in1, in2
- * BlockSize: the config of this device
- * OpFunc: compute functor which have an operator() as following
- * template <typename T, typename OutT>
+ * @brief Binary calculation according to OpFunc. Sizes of the input and
+ * output are the same.
+ *
+ * @template parameters
+ * InT: Data type of in1 and in2.
+ * OutT: Data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * OpFunc: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
 * struct XxxFunctor {
- * HOSTDEVICE OutT operator()(const T& a, const T& b) const {
+ * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
 * return ...;
 * }
 * };
+ *
+ * @param:
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of first input, size is NX * NY.
+ * in2: The register pointer of second input, size is NX * NY.
+ * compute: Compute function declared as OpFunc().
 */
-template <typename T, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
-__device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
-                                                  const T* in2,
+template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
+__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
+                                                  const InT* in2,
                                                   OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
@@ -188,25 +207,38 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1,
 }
 /**
- * @brief ternary function, in1, in2 and in3 have same shape
- * @param
- * T: data type of in1, in2, in3
- * OutT: data type of out
- * NX: the cols of in1, in2
- * NY: the rows of in1, in2
- * BlockSize: the config of this device
- * OpFunc: compute functor which have an operator() as following
- * template <typename T, typename OutT>
+ * @brief Ternary calculation according to OpFunc. Sizes of the input and
+ * output are the same.
+ *
+ * @template parameters
+ * InT: Data type of in1, in2 and in3.
+ * OutT: Data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * OpFunc: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
+ * struct XxxFunctor {
- * HOSTDEVICE OutT operator()(const T& a, const T& b, const T& c) const {
+ * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c)
+ * const {
 * return ...;
 * }
 * };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of first input, size is NX * NY.
+ * in2: The register pointer of second input, size is NX * NY.
+ * in3: The register pointer of third input, size is NX * NY.
+ * compute: Compute function declared as OpFunc().
 */
-template <typename T, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
-__device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
-                                                   const T* in2, const T* in3,
+template <typename InT, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
+__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
+                                                   const InT* in2,
+                                                   const InT* in3,
                                                    OpFunc compute) {
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
@@ -215,27 +247,36 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const T* in1,
 }
 /**
- * @brief a general function for elementwise computation, all inputs have
- * the same shape.
- * @param
- * T: data type of in1, in2, in3
- * OutT: data type of out
- * NX: the cols of in1, in2
- * NY: the rows of in1, in2
- * BlockSize: the config of this device
- * OpFunc: compute functor which have an operator() as following
- * template <typename T, typename OutT>
+ * @brief Multivariate calculation according to OpFunc. Sizes of the input and
+ * output are the same.
+ *
+ * @template parameters
+ * InT: Data type of the elements of ins.
+ * OutT: Data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * Arity: The size of ins.
+ * OpFunc: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
+ * struct XxxFunctor {
- * HOSTDEVICE OutT operator()(const T* args) const {
+ * HOSTDEVICE OutT operator()(const InT* args) const {
 * return ...;
 * }
 * };
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * ins: An array of pointers consisting of multiple inputs.
+ * compute: Compute function declared as OpFunc().
 */
-template <typename T, typename OutT, int NX, int NY, int BlockSize, int Arity, class OpFunc>
-__device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY],
+template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity, class OpFunc>
+__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
                                                OpFunc compute) {
-  T args[Arity];
+  InT args[Arity];
 #pragma unroll
   for (int idx = 0; idx < NX * NY; ++idx) {
 #pragma unroll
@@ -247,15 +288,31 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, T (*ins)[NX * NY],
 }
 /**
- * @brief cycle binary function, in1's shape size is [1, NX], in2's shape size
- * is [NY, NX], out's shape size is [NY, NX]
+ * @brief Binary calculation according to OpFunc. Shapes of in1 and in2 are
+ * different. Shape of in1 is [1, NX], in2's shape is [NY, NX], and the output
+ * shape is [NY, NX].
+ *
+ * @template parameters
+ * InT: Data type of in1 and in2.
+ * OutT: Data type of out.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * OpFunc: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
+ * struct XxxFunctor {
+ * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
+ * return ...;
+ * }
+ * };
+ *
 * @param
- * T: data type of in1, in2
- * OutT: data type of out
- * NX: the cols of in1, in2
- * NY: the rows of in1, in2
- * BlockSize: the config of this device
- * OpFunc: compute functor eg: in1 + in2, in1 - in2
+ * out: The register pointer of out, the size is NX * NY.
+ * in1: The register pointer of first input, size is NX * 1.
+ * in2: The register pointer of second input, size is NX * NY.
+ * compute: Compute function declared as OpFunc().
 */
 template <typename T, typename OutT, int NX, int NY, int BlockSize, class OpFunc>
 __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
@@ -272,26 +329,37 @@
 }
 /**
- * @brief reduce function, in's shape size is [NX, NY].
- * If ReduceMode == kLocalMode then reduce NX, the shape of out is [NY, 1],
- * if ReduceMode == kGlobalMode then reduce between different threads, the
- * shape of out is [NY, NX]. If reduce_last_dim is false and reduce_num was
- * split, BlockYReduce will be called. If reduce_last_dim is true and
- * reduce_num was split, BlockXReduce will be called
- * @typename
- * T: data type of in
- * NX: the cols of in
- * NY: the rows of in
- * BlockSize: the config of this device
- * OpFunc: reduce functor, eg: CustomSum, CustomMean in reduce_functor_op.h
- * @param:
- * reducer: reduce functor, eg: CustomSum()
- * reduce_last_dim: if in's last dim need to be reduce then reduce_last_dim =
- * true
+ * @brief The Reduce provides collective methods for computing a parallel
+ * reduction of items partitioned across a CUDA block and within each thread.
When
+ * ReduceMode == kLocalMode, each thread reduces along nx. When ReduceMode ==
+ * kGlobalMode, shared memory is used to reduce between threads.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: The number of data continuously loaded by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * ReduceFunctor: Compute functor which has an operator() as follows:
+ * template <typename InT, typename OutT>
+ * struct ReduceFunctor {
+ * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
+ * return ...;
+ * }
+ * };
+ * ReduceMode: Reduce mode, can be kLocalMode or kGlobalMode.
+ *
+ * @param
+ * out: The register pointer of out, the size is NX * NY.
+ * in: The register pointer of in, the size is NX * NY.
+ * reducer: Compute function declared as ReduceFunctor().
+ * reduce_last_dim: Whether the last dimension is involved in the reduction.
 */
-template <typename T, int NX, int NY, int BlockSize, class OpFunc, details::ReduceMode Mode>
-__device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
+template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor, details::ReduceMode Mode>
+__device__ __forceinline__ void Reduce(T* out, const T* in,
+                                       ReduceFunctor reducer,
                                        bool reduce_last_dim) {
   int block_index = blockDim.y;
@@ -302,7 +370,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
     if (block_reduce_y) {
 #pragma unroll
       for (int i = 0; i < NY * NX; i++) {  // reduce along blockdim.y
-        out[i] = details::BlockYReduce<T, OpFunc>(out[i], reducer);
+        out[i] = details::BlockYReduce<T, ReduceFunctor>(out[i], reducer);
       }
     }
@@ -310,7 +378,7 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, OpFunc reducer,
     if (reduce_last_dim) {
 #pragma unroll
       for (int i = 0; i < NY * NX; i++) {  // reduce along blockDim.x
-        out[i] = details::BlockXReduce<T, OpFunc>(out[i], reducer);
+        out[i] = details::BlockXReduce<T, ReduceFunctor>(out[i], reducer);
       }
     }
   } else {  // else kLocalMode
diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
index 3932ba1502ecb44e90454aa32591cb4ffca62cd2..c720bedf0a3afcfd903af7f3bf0c97791774e5ec 100644
--- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h
@@ -32,7 +32,13 @@ template <typename T, int VecSize>
 struct alignas(sizeof(T) * VecSize) VectorType {
   T val[VecSize];
 };
-
+/**
+ * Fast division: Replace division in CUDA with multiplication to improve
+ * kernel performance.
+ * 1. Complete the division calculation on the CPU, and record the calculation
+ * results using divider and shift_val.
+ * 2. Set the divisor on the GPU through Div() to complete the calculation.
+ */
 struct FastDivMod {
   // 1st value represents the result of input number divides by recorded divisor
   // 2nd value represents the result of input number modulo by recorded divisor
@@ -71,6 +77,11 @@ struct FastDivMod {
   uint32_t multiplier;
 };
+/**
+ * Configuration of broadcast. Calculate the input data index according to the
+ * index of the output data. If the input or output shape is [dim0, dim1],
+ * then dims must be [dim1, dim0].
+ */
 template <int kDims>
 struct BroadcastConfig {
   FastDivMod divmoders[kDims];
@@ -107,65 +118,31 @@
 }  // namespace details
 /**
- * @brief load data from src to dst, src can be 1D data or 2D data. Note that
- * you can use this function when you are sure that the data will not cross the
- * boundary.
- * @typename:
- * Tx: data type of src
- * Ty: data type of dstt
- * NX: the cols of src, dst
- * NY: the rows of src, dst
- * BlockSize: the config of this device
- * @param:
- * stride_nx: the stride of cols
- * stride_ny: the stride of rows
- */
-
-template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
-__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
-                                         int stride_nx, int stride_ny) {
-  int thread_offset = threadIdx.x * NX;
-
-  if (NY == 1 && NX == 1) {
-    dst[0] = static_cast<Ty>(src[thread_offset]);
-  } else if (NX == 1) {
-#pragma unroll
-    for (int idy = 0; idy < NY; ++idy) {
-      dst[idy] = static_cast<Ty>(src[thread_offset + idy * stride_ny]);
-    }
-  } else if (NY == 1) {
-#pragma unroll
-    for (int idx = 0; idx < NX; ++idx) {
-      dst[idx] = static_cast<Ty>(src[thread_offset + idx * stride_nx]);
-    }
-  } else {
-#pragma unroll
-    for (int idx = 0; idx < NX; ++idx) {
-#pragma unroll
-      for (int idy = 0; idy < NY; ++idy) {
-        dst[idy * NX + idx] = static_cast<Ty>(
-            src[thread_offset + idx * stride_nx + idy * stride_ny]);
-      }
-    }
-  }
-}
-
-/**
- * @brief load data from src to dst with stride, src can be 1D data or 2D data.
- * When boundary judgment is required, you need to set a to true, and a is false
- * by default.
- * @typename:
- * Tx: data type of src
- * Ty: data type of dstt
- * NX: the cols of src, dst
- * NY: the rows of src, dst
- * BlockSize: the config of this device
- * IsBoundary: whether to make boundary judgment
+ * @brief Read 2D data from global memory to registers according to Tx type,
+ * and store it as Ty type.
+ *
+ * @template parameters
+ * Tx: The type of data stored in the global memory.
+ * Ty: The type of data that needs to be stored in registers.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * IsBoundary: Indicates whether to perform out-of-bounds checks on memory
+ * accesses. When the number of data processed by the block is less than
+ * NX x NY x blockDim, boundary checks are required to avoid memory access
+ * crossing the boundary.
+ *
 * @param:
- * size_nx: number of columns to be processed by the current block
- * size_ny: number of rows to be processed by the current block
- * stride_nx: the stride of cols
- * stride_ny: the stride of rows
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: Data pointer of the current block.
+ * size_nx: The current block needs to load size_nx columns of data, this
+ * parameter will be used when IsBoundary = true.
+ * size_ny: The current block needs to load size_ny rows of data. This
+ * parameter will be used when IsBoundary = true.
+ * stride_nx: The stride of cols.
+ * stride_ny: The stride of rows.
 */
 template <typename Tx, typename Ty, int NX, int NY, int BlockSize, bool IsBoundary = false>
@@ -226,6 +203,17 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
   }
 }
+/**
+ * @brief Initialize register with init_data.
+ *
+ * @template parameters
+ * T: Data type of register.
+ * NX: Number of data to initialize.
+ *
+ * @param:
+ * dst: The register pointer of the thread, the size is NX.
+ * init_data: Initial value.
+ */
 template <typename T, int NX>
 __device__ __forceinline__ void Init(T* dst, T init_data) {
 #pragma unroll
@@ -234,18 +222,27 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
   }
 }
-/** @brief: ReadData
- * @brief load data from src to dst, src can be 1D data, you should set NY = 1.
- * When boundary judgment is required, you need to set a to true, and a is false
- * by default.
- * @typename:
- * T : the data type of src
- * NX: the cols of src, dst
- * NY: in this function NY only can be 1
- * BlockSize: the config of this device
- * IsBoundary: whether to make boundary judgment
+/**
+ * @brief Read 2D data from global memory to registers. When IsBoundary = true
+ * and (NX % 4 == 0 or NX % 2 == 0), vectorized loads will be used to
+ * improve memory access efficiency.
+ *
+ * @template parameters
+ * T: Data type of src and dst.
+ * NX: The number of data continuously loaded by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * IsBoundary: Whether to make an out-of-bounds check on memory accesses.
+ * When the number of data processed by this block is less than
+ * NX x NY x blockDim, boundary checks are required to avoid memory access
+ * crossing the boundary.
+ *
 * @param:
- * num: number of columns to be processed by the current block
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: Data pointer of the current block.
+ * size: The current block needs to load size data continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
@@ -279,28 +276,38 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
 }
 /**
- * @brief: read data for broadcast
- * @typename:
- * T : the data type of src
- * NX: the cols of src, dst
- * NY: in this function NY only can be 1
- * BlockSize: the config of this device
- * ShapeSize: the shape size of out. eg in[1, 35], out[32, 35] then shape size
- * is 2
- * IsBoundary: whether to make boundary judgment
+ * @brief Read 2D data from global memory to registers for broadcast.
+ *
+ * @template parameters
+ * T: The type of data stored in the global memory.
+ * NX: The number of data columns loaded by each thread.
+ * NY: The number of data rows loaded by each thread.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * Rank: The shape size of out, e.g. for in[1, 35] and out[32, 35] the shape
+ * size is 2.
+ * IsBoundary: Indicates whether to perform out-of-bounds checks on memory
+ * accesses. When the number of data processed by the block is less than
+ * NX x NY x blockDim, boundary checks are required to avoid memory access
+ * crossing the boundary.
+ *
 * @param:
- * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
- * config: get the global index in src, attention config was declared in host;
- * total_num_output: total num of output
- * stride_nx: the stride of cols
- * stride_ny: the stride of rows
+ * dst: The register pointer of the thread, the size is NX * NY.
+ * src: Raw input data pointer of the kernel.
+ * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX.
+ * config: Calculation configuration of broadcast. It is used to calculate the
+ * coordinate mapping relationship between output data and input data. Please
+ * refer to the sample code below for specific usage.
+ * total_num_output: Total number of elements in the original output.
+ * stride_nx: The stride of cols.
+ * stride_ny: The stride of rows.
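+ *
+ * A minimal usage sketch (illustrative only; the surrounding kernel and the
+ * names VecSize and kRank are assumptions, not part of this header):
+ *   T reg[VecSize];
+ *   // Map each output index back to its broadcast input element and load it.
+ *   ReadDataBc<T, VecSize, 1, 1, kRank, IsBoundary>(
+ *       reg, in, block_offset, config, total_num_output, 1, 1);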
*/ -template __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, - int stride_nx, int stride_ny) { + details::BroadcastConfig config, int total_num_output, int stride_nx, + int stride_ny) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -316,7 +323,7 @@ __device__ __forceinline__ void ReadDataBc( } } #pragma unroll - for (int i = 0; i < ShapeSize; ++i) { + for (int i = 0; i < Rank; ++i) { auto fast_divmoder = config.divmoders[i].Divmod(index_output); index_output = fast_divmoder.val[0]; index_src += fast_divmoder.val[1] * config.strides[i]; @@ -327,27 +334,41 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief: read data for broadcast - * @typename: - * T : the data type of src - * NX: the cols of src, dst - * NY: in this function NY only can be 1 - * BlockSize: the config of this device - * ShapeSize: the shape size of out. eg in[1, 35], out[32, 35] then shape size - * is 2 - * IndexCal: get the global index in src, attention config was declared in host; - * IsBoundary: whether to make boundary judgment + * @brief Read 2D data from global memory to registers for reduce. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index, and for xpu, core_id() is used as + * the index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim, boundary judgment is required to avoid memory access + * crossing the boundary. + * * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: Raw input data pointer of kernel. + * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * index_cal: Calculation configuration of Reduce. It is used to calculate the + * coordinate mapping relationship between output data and input data. Please + * refer to the sample code for specific usage. * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; * index_cal: get the global index in src, attention config was declared in * host; - * size_nx: number of columns to be processed by the current block - * size_ny: number of rows to be processed by the current block - * stride_nx: the stride of cols - * stride_ny: the stride of rows - * reduce_last_dim: according to the block split set threadIdx + * size_nx: The current block needs to load size_nx columns of data, this + * parameter will be used when IsBoundary = true. + * size_ny: The current block needs to load size_ny rows of data. This parameter + * will be used when IsBoundary = true. + * stride_nx: The stride of cols. + * stride_ny: The stride of rows. + * reduce_last_dim: Used to indicate whether the dimension of reduce contains + * the lowest dimension. */ -template __device__ __forceinline__ void ReadDataReduce( T* dst, const T* __restrict__ src, int block_offset, @@ -397,17 +418,26 @@ __device__ __forceinline__ void ReadDataReduce( } /** - * @brief store data from src to dst, src can be 1D data, you should set NY = 1. 
- * When boundary judgment is required, you need to set a to true, and a is false
- * by default.
- * @typename:
- * T : the data type of src
- * NX: the cols of src, dst
- * NY: in this function NY only can be 1
- * BlockSize: the config of this device
- * IsBoundary: whether to make boundary judgment
+ * @brief Write 2D data from registers to global memory. When IsBoundary = true
+ * and (NX % 4 == 0 or NX % 2 == 0), the data will be vectorized to improve the
+ * data storing efficiency.
+ *
+ * @template parameters
+ * T: The type of data.
+ * NX: The number of data continuously loaded by each thread.
+ * NY: The number of data rows loaded by each thread, only NY = 1 is supported.
+ * BlockSize: Identifies the current device thread index method. For GPU,
+ * threadIdx.x is used as the thread index, and for XPU, core_id() is used as
+ * the index. Currently only GPU is supported.
+ * IsBoundary: Indicates whether to perform out-of-bounds checks on memory
+ * accesses. When the number of data processed by the block is less than
+ * NX x NY x blockDim, boundary checks are required to avoid memory access
+ * crossing the boundary.
+ *
 * @param:
- * num: number of columns to be processed by the current block
+ * dst: Data pointer of the current block.
+ * src: The register pointer of the thread, the size is NX * NY.
+ * size: The current block needs to store size data continuously.
 */
 template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
 __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
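To make the intended composition of these primitives concrete, the sketch below shows a vectorized elementwise-add kernel written only against the APIs documented in this diff (ReadData, ElementwiseBinary, WriteData). It is illustrative and not part of the patch: the kernel, AddFunctor, VecSize and the kps namespace alias are assumptions.

#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"

namespace kps = paddle::operators::kernel_primitives;  // assumed alias

template <typename T>
struct AddFunctor {
  __device__ T operator()(const T& a, const T& b) const { return a + b; }
};

template <typename T, int VecSize, bool IsBoundary>
__global__ void VectorizedAddKernel(const T* x, const T* y, T* z, int num) {
  // Each block handles blockDim.x * VecSize contiguous elements.
  int offset = blockIdx.x * blockDim.x * VecSize;
  T x_reg[VecSize];
  T y_reg[VecSize];
  T z_reg[VecSize];
  // Load one tile of each input into registers (boundary-checked if needed).
  kps::ReadData<T, VecSize, 1, 1, IsBoundary>(x_reg, x + offset, num - offset);
  kps::ReadData<T, VecSize, 1, 1, IsBoundary>(y_reg, y + offset, num - offset);
  // Apply the binary functor on this thread's registers.
  kps::ElementwiseBinary<T, T, VecSize, 1, 1, AddFunctor<T>>(
      z_reg, x_reg, y_reg, AddFunctor<T>());
  // Store the result tile back to global memory.
  kps::WriteData<T, VecSize, 1, 1, IsBoundary>(z + offset, z_reg, num - offset);
}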