未验证 提交 6840cf55 编写于 作者: N niuliling123 提交者: GitHub

Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1 (#36373) (#36616)

* Fix a bug in ReadData, ReadDataBc and ReadDataReduce when NX != 1
* Update the implement of reduceAnyKernel according to kernel primitive api
上级 30909889
......@@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData(
// num: how many data will be deal with in this time
if (need_broadcast) {
kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset,
config, numel, 1, 1);
config, numel);
} else {
kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num);
}
......
......@@ -76,14 +76,14 @@ __global__ void BroadcastKernelBinary(
// load in0
if (use_broadcast[0]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg0, in0, fix, configlists[0], numel, 1, 1);
arg0, in0, fix, configlists[0], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
}
// load in1
if (use_broadcast[1]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1, ShapeSize>(
arg1, in1, fix, configlists[1], numel, 1, 1);
arg1, in1, fix, configlists[1], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
}
......
......@@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
} // namespace details
/**
* @brief Perform unary calculation according to OpFunc. Size of input and
* @brief Perform unary calculation according to OpFunc. Shape of input and
* output are the same.
*
* @template paraments
* InT: Data type of in.
* OutT: Data type of out.
* InT: The data type of in.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* struct XxxFunctor {
......@@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
}
/**
* @brief Binary calculation according to OpFunc. Size of The input and output
* @brief Binary calculation according to OpFunc. Shape of The input and output
* are the same.
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b) const {
* HOSTDEVICE InT operator()(const InT& a, const InT& b) const {
* return ...;
* }
* };
......@@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* out: The register pointer of out, the size is NX * NY.
* in1: The register pointer of fist input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
......@@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
}
/**
* @brief Ternary calculation according to OpFunc. Size of input and output
* @brief Ternary calculation according to OpFunc. Shape of input and output
* are the same.
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c)
* HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c)
* const {
* return ...;
* }
......@@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in1: The register pointer of fist input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
class OpFunc>
......@@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
}
/**
* @brief Multivariate calculation according to OpFunc. Size of input and output
* are the same.
* @brief Multivariate calculation according to OpFunc. Shape of inputs and
* output are the same.
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* InT: The data type of in1, in2 and in3.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* Arity: The size of ins
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Arity: The size of ins.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
* template <typename InT>
* struct XxxFunctor {
* HOSTDEVICE OutT operator()(const InT* args) const {
* HOSTDEVICE InT operator()(const InT* args) const {
* return ...;
* }
* };
*
* @param
* out: The register pointer of out, the size is NX * NY.
* ins: An array of pointers consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
class OpFunc>
......@@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* shape is [NY, NX].
*
* @template paraments
* InT: Data type of in1 and in2.
* OutT: Data type of out.
* InT: The data type of in1 and in2.
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
* struct XxxFunctor {
......@@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1,
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
* struct ReduceFunctor {
......
......@@ -118,8 +118,8 @@ struct BroadcastConfig {
} // namespace details
/**
* @brief Read 2D data from global memory to registers according to Tx type, and
* store it as Ty type.
* @brief Read 2D data from global memory to register according to Tx type, and
* store it as Ty type into register.
*
* @template paraments
* Tx: The type of data stored in the global memory.
......@@ -127,8 +127,7 @@ struct BroadcastConfig {
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
......@@ -136,20 +135,20 @@ struct BroadcastConfig {
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block.
* size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter
* will be used when IsBoundary = true.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* src: The data pointer of the current block.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. The parameters are only calculated when isboundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. The parameters are only calculated when isboundary = true.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x * NX;
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
......@@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy >= size_ny) {
if (idy * stride_ny >= size_ny) {
break;
}
}
......@@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= left_size_nx) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
......@@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= left_size_nx) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy >= size_ny) {
if (idy * stride_ny >= size_ny) {
break;
}
}
......@@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
}
/**
* @brief Read 2D data from global memory to registers. When IsBoundary = true
* @brief Read 1D data from global memory to register. When IsBoundary = true
* and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to
* improve memory access efficiency.
*
* @template paraments
* T: Data type of src and dst.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Data pointer of the current block.
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
......@@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
}
/**
* @brief Read 2D data from global memory to registers for broadcast.
* @brief Read 2D data from global memory to registers with broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX;
* src: The original input data pointer of this kernel.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please
* refer to the sample code for specific usage.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false>
......@@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t thread_offset = block_offset + threadIdx.x;
uint32_t index_src = 0;
#pragma unroll
......@@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc(
}
/**
* @brief Read 2D data from global memory to registers for reduce.
* @brief Read 2D data from global memory to register with reduce form.
*
* @template paraments
* T: The type of data stored in the global memory.
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: Raw input data pointer of kernel.
* block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX;
* src: The input data pointer of this block.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX.
* index_cal: Calculation configuration of Reduce. It is used to calculate the
* coordinate mapping relationship between output data and input data. Please
* refer to the sample code for specific usage.
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
* index_cal: get the global index in src, attention config was declared in
* host;
* coordinate mapping relationship between output data and input data.
* size_nx: The current block needs to load size_nx columns of data, this
* parameter will be used when IsBoundary = true.
* size_ny: The current block needs to load size_ny rows of data. This parameter
* parameter will participate in the calculation when isboundary = true.
* size_ny: The current block needs to load size_ny rows of data, this parameter
* will participate in the calculation when isboundary = true.
* will be used when IsBoundary = true.
* stride_nx: The stride of cols.
* stride_ny: The stride of rows.
* stride_nx: Each read one element stride stride_nx columns.
* stride_ny: Each read one element stride stride_ny raws.
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
*/
......@@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce(
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
int stride_ny, bool reduce_last_dim) {
int thread_offset = 0;
int left_idx = 0;
if (reduce_last_dim) {
thread_offset = block_offset + threadIdx.x;
thread_offset = threadIdx.x;
left_idx = threadIdx.y;
} else {
thread_offset = block_offset + threadIdx.y;
thread_offset = threadIdx.y;
left_idx = threadIdx.x;
}
if (NX == 1) {
......@@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce(
break;
}
}
uint32_t index_src = index_cal(thread_offset);
uint32_t index_src = index_cal(thread_offset + block_offset);
dst[ny] = src[index_src];
thread_offset += stride_ny;
}
} else {
#pragma unroll
for (int nx = 0; nx < NX; ++nx) {
if (IsBoundary) {
if (nx * stride_nx >= size_nx) {
break;
}
}
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
if (IsBoundary) {
if (nx * stride_nx >= size_nx) {
if ((thread_offset >= size_ny) ||
(left_idx + nx * stride_nx >= size_nx)) {
break;
}
}
uint32_t index_src = index_cal(thread_offset);
uint32_t index_src = index_cal(thread_offset + block_offset);
dst[nx + ny * NX] = src[index_src];
thread_offset += stride_ny;
}
thread_offset += stride_nx;
}
}
}
......@@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce(
*
* @template paraments
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NX: The number of data continuously writed by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index, and for xpu, core_id() is used as
* the index. Currently only GPU was supported.
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim, boundary judgment is required to avoid memory access
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: Data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size: The current block needs to load size data continuously.
* dst: The data pointer of the current block.
* src: The register pointer, the size is NX * NY.
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
......@@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
}
}
/**
* @brief Write 2D data from register to global memory according to Tx type, and
* store it as Ty type.
*
* @template paraments
* Tx: The type of data that needs to be stored in registers.
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The data pointer of the current block.
* src: The register pointer of the thread, the size is NX * NY.
* size_nx: The maximum offset of the current block is size_nx elements in the
* lowest dimension. The parameters are only calculated when isboundary = true.
* size_ny: The maximum offset of the current block is size_ny elements in the
* first dimension. The parameters are only calculated when isboundary = true.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1
if (IsBoundary) {
if (left_size_nx > 0) {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else {
dst[thread_offset] = static_cast<Ty>(src[0]);
}
} else if (NX == 1) { // for NX == 1 and NY != 1
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idy * stride_ny] = static_cast<Ty>(src[idy]);
}
} else if (NY == 1) { // for NY == 1 and NX != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
dst[thread_offset + idx * stride_nx] = static_cast<Ty>(src[idx]);
}
} else { // for NX != 1 and NY != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx * stride_nx >= left_size_nx) {
break;
}
}
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
if (IsBoundary) {
if (idy * stride_ny >= size_ny) {
break;
}
}
dst[thread_offset + idx * stride_nx + idy * stride_ny] =
static_cast<Ty>(src[idy * NX + idx]);
}
}
}
}
/**
* @brief Initialize register with init_data.
*
* @template paraments
* T: Data type of register.
* NX: Number of data to initialize.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, bool IsBoundary = false>
__device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
#pragma unroll
for (int i = 0; i < NX; i++) {
if (IsBoundary) {
if (i >= num) {
break;
}
}
dst[i] = init_data[i];
}
}
/**
* @brief Read 1D data from global memory to register with broadcast form.
*
* @template paraments
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
* NX x NY x blockDim.x, boundary judgment is required to avoid memory access
* crossing the boundary.
*
* @param:
* dst: The register pointer of the thread, the size is NX * NY.
* src: The original input data pointer of kernel.
* block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX;
* config: Calculation configuration of broadcast. It is used to calculate the
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0;
#pragma unroll
for (uint32_t nx = 0; nx < NX; ++nx) {
uint32_t index_output = thread_offset + nx;
index_src = 0;
if (IsBoundary) {
if (index_output >= total_num_output) {
break;
}
}
#pragma unroll
for (int i = 0; i < Rank; ++i) {
auto fast_divmoder = config.divmoders[i].Divmod(index_output);
index_output = fast_divmoder.val[0];
index_src += fast_divmoder.val[1] * config.strides[i];
}
dst[nx] = src[index_src];
}
}
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
......@@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer,
kps::WriteData<Ty, 1, 1, 1, IsBoundary>(y + store_offset, &temp_data, size);
}
template <typename Tx, typename MPType, typename ReduceOp, typename TransformOp,
typename Calculator, bool IsBoundary>
__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var,
ReduceOp reducer, TransformOp transformer,
MPType init, int reduce_num, int input_idx,
bool reduce_last_dim,
const Calculator& reduce_index_calculator,
int stride, int num) {
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
MPType input_transform[REDUCE_VEC_SIZE];
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, IsBoundary>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_transform[0], &input_reg[0], transformer);
kps::Init<MPType, REDUCE_VEC_SIZE, IsBoundary>(input_compute, input_transform,
num);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
// when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or
// when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this
// function will be used
......@@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer,
// 1. reduce for each thread
if (left_idx < left_num) {
// load REDUCE_VEC_SIZE data once, and then compute
Tx input_reg[REDUCE_VEC_SIZE];
MPType input_compute[REDUCE_VEC_SIZE];
int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride;
for (; input_idx + block_size < bound;
input_idx += REDUCE_VEC_SIZE * stride) {
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1,
reduce_num, 1, stride, reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
&input_compute[0], &input_reg[0], transformer);
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
}
kps::Init<MPType, REDUCE_VEC_SIZE>(&input_compute[0], init);
kps::ReadDataReduce<Tx, 1, REDUCE_VEC_SIZE, 1, 1, Calculator, true>(
&input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num,
1, stride, reduce_last_dim);
input_idx += tid;
#pragma unroll
for (int i = 0; i < REDUCE_VEC_SIZE; ++i) {
if (input_idx >= reduce_num) {
break;
}
input_compute[i] = static_cast<MPType>(transformer(input_reg[i]));
input_idx += stride;
}
kps::Reduce<MPType, REDUCE_VEC_SIZE, 1, 1, ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, false>(
input, &reduce_var, reducer, transformer, init, reduce_num, input_idx,
reduce_last_dim, reduce_index_calculator, stride, reduce_num);
}
int num = (reduce_num - input_idx - tid + stride - 1) / stride;
ReduceAnyKernelImpl<Tx, MPType, ReduceOp, TransformOp, Calculator, true>(
input, &reduce_var, reducer, transformer, init, reduce_num - input_idx,
input_idx, reduce_last_dim, reduce_index_calculator, stride, num);
}
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册