未验证 提交 6840cf55 编写于 作者: N niuliling123 提交者: GitHub

Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 (#36373) (#36616)

* Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1
* Update the implement of reduceAnyKernel according to kernel primitive api
上级 30909889
...@@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData( ...@@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData(
// num: how many data will be deal with in this time // num: how many data will be deal with in this time
if (need_broadcast) { if (need_broadcast) {
kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset, kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset,
config, numel, 1, 1); config, numel);
} else { } else {
kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num); kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num);
} }
......
...@@ -76,14 +76,14 @@ __global__ void BroadcastKernelBinary( ...@@ -76,14 +76,14 @@ __global__ void BroadcastKernelBinary(
// load in0 // load in0
if (use_broadcast[0]) { if (use_broadcast[0]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>( kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg0, in0, fix, configlists[0], numel, 1, 1); arg0, in0, fix, configlists[0], numel);
} else { } else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num); kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
} }
// load in1 // load in1
if (use_broadcast[1]) { if (use_broadcast[1]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>( kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg1, in1, fix, configlists[1], numel, 1, 1); arg1, in1, fix, configlists[1], numel);
} else { } else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num); kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
} }
......
...@@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { ...@@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
} // namespace details } // namespace details
/** /**
* @brief Perform unary calculation according to OpFunc. Size of input and * @brief Perform unary calculation according to OpFunc. Shape of input and
* output are the same. * output are the same.
* *
* @template paraments * @template paraments
* InT: Data type of in. * InT: The data type of in.
* OutT: Data type of out. * OutT: The data type of out.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following: * OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT> * template <typename InT, typename OutT>
* struct XxxFunctor { * struct XxxFunctor {
...@@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, ...@@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
} }
/** /**
* @brief Binary calculation according to OpFunc. Size of The input and output * @brief Binary calculation according to OpFunc. Shape of The input and output
* are the same. * are the same.
* *
* @template paraments * @template paraments
* InT: Data type of in1 and in2. * InT: The data type of in1 and in2.
* OutT: Data type of out. * OutT: The data type of out.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns computed by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following: * OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT> * template <typename InT>
* struct XxxFunctor { * struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { * HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
* return ...; * return ...;
* } * }
* }; * };
...@@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, ...@@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* out: The register pointer of out, the size is NX * NY. * out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of fist input, size is NX * NY. * in1: The register pointer of fist input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc> class OpFunc>
...@@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, ...@@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
} }
/** /**
* @brief Ternary calculation according to OpFunc. Size of input and output * @brief Ternary calculation according to OpFunc. Shape of input and output
* are the same. * are the same.
* *
* @template paraments * @template paraments
* InT: Data type of in1 and in2. * InT: The data type of in1 and in2.
* OutT: Data type of out. * OutT: The data type of out.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following * OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT> * template <typename InT>
* struct XxxFunctor { * struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) * HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)
* const { * const {
* return ...; * return ...;
* } * }
...@@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, ...@@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in1: The register pointer of fist input, size is NX * NY. * in1: The register pointer of fist input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc> class OpFunc>
...@@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, ...@@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
} }
/** /**
* @brief Multivariate calculation according to OpFunc. Size of input and output * @brief Multivariate calculation according to OpFunc. Shape of inputs and
* are the same. * output are the same.
* *
* @template paraments * @template paraments
* InT: Data type of in1 and in2. * InT: The data type of in1, in2 and in3.
* OutT: Data type of out. * OutT: The data type of out.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported. * Arity: The size of ins.
* Arity: The size of ins
* OpFunc: Compute functor which has an operator() as following: * OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT> * template <typename InT>
* struct XxxFunctor { * struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT* args) const { * HOSTDEVICE InT operator()(const InT* args) const {
* return ...; * return ...;
* } * }
* }; * };
* *
* @param * @param
* out: The register pointer of out, the size is NX * NY. * out: The register pointer of out, the size is NX * NY.
* ins: An array of pointers consisting of multiple inputs. * ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity, template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
class OpFunc> class OpFunc>
...@@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], ...@@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* shape is [NY, NX]. * shape is [NY, NX].
* *
* @template paraments * @template paraments
* InT: Data type of in1 and in2. * InT: The data type of in1 and in2.
* OutT: Data type of out. * OutT: The data type of out.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following * OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT> * template <typename InT, typename OutT>
* struct XxxFunctor { * struct XxxFunctor {
...@@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, ...@@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
* NX: The number of data continuously loaded by each thread. * NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported. * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* ReduceFunctor: Compute functor which has an operator() as following * ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT> * template <typename InT>
* struct ReduceFunctor { * struct ReduceFunctor {
......
...@@ -118,8 +118,8 @@ struct BroadcastConfig { ...@@ -118,8 +118,8 @@ struct BroadcastConfig {
} // namespace details } // namespace details
/** /**
* @brief Read 2D data from global memory to registers according to Tx type, and * @brief Read 2D data from global memory to register according to Tx type, and
* store it as Ty type. * store it as Ty type into register.
* *
* @template paraments * @template paraments
* Tx: The type of data stored in the global memory. * Tx: The type of data stored in the global memory.
...@@ -127,8 +127,7 @@ struct BroadcastConfig { ...@@ -127,8 +127,7 @@ struct BroadcastConfig {
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than * judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access * NX x NY x blockDim, boundary judgment is required to avoid memory access
...@@ -136,20 +135,20 @@ struct BroadcastConfig { ...@@ -136,20 +135,20 @@ struct BroadcastConfig {
* *
* @param: * @param:
* dst: The register pointer of the thread, the size is NX * NY. * dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block. * src: The data pointer of the current block.
* size_nx: The current block needs to load size_nx columns of data, this * size_nx: The maximum offset of the current block is size_nx elements in the
* parameter will be used when IsBoundary = true. * lowest dimension. The parameters are only calculated when isboundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter * size_ny: The maximum offset of the current block is size_ny elements in the
* will be used when IsBoundary = true. * first dimension. The parameters are only calculated when isboundary = true.
* stride_nx: The stride of cols. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: The stride of rows. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny, int size_nx, int size_ny,
int stride_nx, int stride_ny) { int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x * NX; int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset; int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance // Each branch is added for better performance
...@@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, ...@@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll #pragma unroll
for (int idy = 0; idy < NY; ++idy) { for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) { if (IsBoundary) {
if (idy >= size_ny) { if (idy * stride_ny >= size_ny) {
break; break;
} }
} }
...@@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, ...@@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX; ++idx) { for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) { if (IsBoundary) {
if (idx >= left_size_nx) { if (idx * stride_nx >= left_size_nx) {
break; break;
} }
} }
...@@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, ...@@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX; ++idx) { for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) { if (IsBoundary) {
if (idx >= left_size_nx) { if (idx * stride_nx >= left_size_nx) {
break; break;
} }
} }
#pragma unroll #pragma unroll
for (int idy = 0; idy < NY; ++idy) { for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) { if (IsBoundary) {
if (idy >= size_ny) { if (idy * stride_ny >= size_ny) {
break; break;
} }
} }
...@@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { ...@@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
} }
/** /**
* @brief Read 2D data from global memory to registers. When IsBoundary = true * @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
* improve memory access efficiency. * improve memory access efficiency.
* *
* @template paraments * @template paraments
* T: Data type of src and dst. * T: The type of data.
* NX: The number of data continuously loaded by each thread. * NX: Each thread load NX data from global memory continuously.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported. * NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory. * IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than * When the number of data processed by this block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary. * crossing the boundary.
* *
* @param: * @param:
* dst: The register pointer of the thread, the size is NX * NY. * dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block. * src: The data pointer of the current block.
* size: The current block needs to load size data continuously. * size: The current block needs to load size data continuously.
*/ */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false> template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
...@@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, ...@@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
} }
/** /**
* @brief Read 2D data from global memory to registers for broadcast. * @brief Read 2D data from global memory to registers with broadcast form.
* *
* @template paraments * @template paraments
* T: The type of data stored in the global memory. * T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than * judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary. * crossing the boundary.
* *
* @param: * @param:
* dst: The register pointer of the thread, the size is NX * NY. * dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel. * src: The original input data pointer of this kernel.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* config: Calculation configuration of broadcast. It is used to calculate the * config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please * coordinate mapping relationship between output data and input data.
* refer to the sample code for specific usage.
* total_num_output: Total number of original output. * total_num_output: Total number of original output.
* stride_nx: The stride of cols. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: The stride of rows. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false> bool IsBoundary = false>
...@@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc( ...@@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset, T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx, details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
int stride_ny) { int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t thread_offset = block_offset + threadIdx.x;
uint32_t index_src = 0; uint32_t index_src = 0;
#pragma unroll #pragma unroll
...@@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc( ...@@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc(
} }
/** /**
* @brief Read 2D data from global memory to registers for reduce. * @brief Read 2D data from global memory to register with reduce form.
* *
* @template paraments * @template paraments
* T: The type of data stored in the global memory. * T: The type of data.
* NX: The number of data columns loaded by each thread. * NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread. * NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than * judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary. * crossing the boundary.
* *
* @param: * @param:
* dst: The register pointer of the thread, the size is NX * NY. * dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel. * src: The input data pointer of this block.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* index_cal: Calculation configuration of Reduce. It is used to calculate the * index_cal: Calculation configuration of Reduce. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please * coordinate mapping relationship between output data and input data.
* refer to the sample code for specific usage.
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
* index_cal: get the global index in src, attention config was declared in
* host;
* size_nx: The current block needs to load size_nx columns of data, this * size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true. * parameter will participate in the calculation when isboundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter * size_ny: The current block needs to load size_ny rows of data, this parameter
* will participate in the calculation when isboundary = true.
* will be used when IsBoundary = true. * will be used when IsBoundary = true.
* stride_nx: The stride of cols. * stride_nx: Each read one element stride stride_nx columns.
* stride_ny: The stride of rows. * stride_ny: Each read one element stride stride_ny raws.
* reduce_last_dim: Used to indicate whether the dimension of reduce contains * reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension. * the lowest dimension.
*/ */
...@@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce( ...@@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce(
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
int stride_ny, bool reduce_last_dim) { int stride_ny, bool reduce_last_dim) {
int thread_offset = 0; int thread_offset = 0;
int left_idx = 0;
if (reduce_last_dim) { if (reduce_last_dim) {
thread_offset = block_offset + threadIdx.x; thread_offset = threadIdx.x;
left_idx = threadIdx.y;
} else { } else {
thread_offset = block_offset + threadIdx.y; thread_offset = threadIdx.y;
left_idx = threadIdx.x;
} }
if (NX == 1) { if (NX == 1) {
...@@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce( ...@@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce(
break; break;
} }
} }
uint32_t index_src = index_cal(thread_offset); uint32_t index_src = index_cal(thread_offset + block_offset);
dst[ny] = src[index_src]; dst[ny] = src[index_src];
thread_offset += stride_ny; thread_offset += stride_ny;
} }
} else { } else {
#pragma unroll #pragma unroll
for (int nx = 0; nx < NX; ++nx) { for (int nx = 0; nx < NX; ++nx) {
if (IsBoundary) {
if (nx * stride_nx >= size_nx) {
break;
}
}
#pragma unroll #pragma unroll
for (int ny = 0; ny < NY; ++ny) { for (int ny = 0; ny < NY; ++ny) {
if (IsBoundary) { if (IsBoundary) {
if (nx * stride_nx >= size_nx) { if ((thread_offset >= size_ny) ||
(left_idx + nx * stride_nx >= size_nx)) {
break; break;
} }
} }
uint32_t index_src = index_cal(thread_offset); uint32_t index_src = index_cal(thread_offset + block_offset);
dst[nx + ny * NX] = src[index_src]; dst[nx + ny * NX] = src[index_src];
thread_offset += stride_ny; thread_offset += stride_ny;
} }
thread_offset += stride_nx;
} }
} }
} }
...@@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce( ...@@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce(
* *
* @template paraments * @template paraments
* T: The type of data. * T: The type of data.
* NX: The number of data continuously loaded by each thread. * NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported. * NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU, * BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as * threadIdx.x is used as the thread index. Currently only GPU was supported.
* the index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than * judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access * NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary. * crossing the boundary.
* *
* @param: * @param:
* dst: Data pointer of the current block. * dst: The data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY. * src: The register pointer, the size is NX * NY.
* size: The current block needs to load size data continuously. * size: The current block needs to load size elements continuously.
*/ */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false> template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
...@@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, ...@@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
} }
} }
/**
* @brief Write 2D data from register to global memory according to Tx type, and
* store it as Ty type.
*
* @template paraments
* Tx: The type of data that needs to be stored in registers.
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. The parameters are only calculated when isboundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. The parameters are only calculated when isboundary = true.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1
if (IsBoundary) {
if (left_size_nx > 0) {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else if (NX == 1) { // for NX == 1 and NY != 1
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idy * stride_ny] = static_cast<Ty>(src[idy]);
}
} else if (NY == 1) { // for NY == 1 and NX != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
dst[thread_offset + idx * stride_nx] = static_cast<Ty>(src[idx]);
}
} else { // for NX != 1 and NY != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idx * stride_nx + idy * stride_ny] =
static_cast<Ty>(src[idy * NX + idx]);
}
}
}
}
/**
* @brief Initialize register with init_data.
*
* @template paraments
* T: Data type of register.
* NX: Number of data to initialize.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, bool IsBoundary = false>
__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
#pragma unroll
for (int i = 0; i < NX; i++) {
if (IsBoundary) {
if (i >= num) {
break;
}
}
dst[i] = init_data[i];
}
}
/**
* @brief Read 1D data from global memory to register with broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The original input data pointer of kernel.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0;
#pragma unroll
for (uint32_t nx = 0; nx < NX; ++nx) {
uint32_t index_output = thread_offset + nx;
index_src = 0;
if (IsBoundary) {
if (index_output >= total_num_output) {
break;
}
}
#pragma unroll
for (int i = 0; i < Rank; ++i) {
auto fast_divmoder = config.divmoders[i].Divmod(index_output);
index_output = fast_divmoder.val[0];
index_src += fast_divmoder.val[1] * config.strides[i];
}
dst[nx] = src[index_src];
}
}
} // namespace kernel_primitives } // namespace kernel_primitives
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer, ...@@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size); kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
} }
template <typename Tx, typename MPType, typename ReduceOp, typename TransformOp,
typename Calculator, bool IsBoundary>
__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
ReduceOp reducer, TransformOp transformer,
MPType init, int reduce_num, int input_idx,
bool reduce_last_dim,
const Calculator& reduce_index_calculator,
int stride, int num) {
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
MPType input_transform[REDUCE_VEC_SIZE];
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, IsBoundary>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_transform[0], &input_reg[0], transformer);
kps::Init<MPType, REDUCE_VEC_SIZE, IsBoundary>(input_compute, input_transform,
num);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used // function will be used
...@@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, ...@@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
// 1. reduce for each thread // 1. reduce for each thread
if (left_idx < left_num) { if (left_idx < left_num) {
// load REDUCE_VEC_SIZE data once, and then compute // load REDUCE_VEC_SIZE data once, and then compute
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
for (; input_idx + block_size < bound; for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) { input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator>( ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, false>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
reduce_num, 1, stride, reduce_last_dim); reduce_last_dim, reduce_index_calculator, stride, reduce_num);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_compute[0], &input_reg[0], transformer);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, true>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
input_idx += tid;
#pragma unroll
for (int i = 0; i < REDUCE_VEC_SIZE; ++i) {
if (input_idx >= reduce_num) {
break;
}
input_compute[i] = static_cast<MPType>(transformer(input_reg[i]));
input_idx += stride;
} }
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp, int num = (reduce_num - input_idx - tid + stride - 1) / stride;
kps::details::ReduceMode::kLocalMode>( ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, true>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim); input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
} }
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>( kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册