diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 53ac85802a6f43f1fb3d4788fd5fc9b993340ddc..549a6be0b4507eb24e27e0a5a17bc0bd5a758787 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData( // num: how many data will be deal with in this time if (need_broadcast) { kps::ReadDataBc(dst, src, block_offset, - config, numel, 1, 1); + config, numel); } else { kps::ReadData(dst, src + block_offset, num); } diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index a8bd35a1b7309a524d1e1ae786c7cbe49fd2eab8..27b903ff62954b03c88a09b1ae4b067ca74f8780 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -76,14 +76,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { kernel_primitives::ReadDataBc( - arg0, in0, fix, configlists[0], numel, 1, 1); + arg0, in0, fix, configlists[0], numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { kernel_primitives::ReadDataBc( - arg1, in1, fix, configlists[1], numel, 1, 1); + arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); } diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 0c751a939b2147754c439f26d89df45d24f1d390..85d526536df81e7adf018a5cd14c82a70be84bb6 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details /** - * @brief Perform unary calculation according to OpFunc. Size of input and + * @brief Perform unary calculation according to OpFunc. Shape of input and * output are the same. * * @template paraments - * InT: Data type of in. - * OutT: Data type of out. + * InT: The data type of in. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: * template * struct XxxFunctor { @@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, } /** - * @brief Binary calculation according to OpFunc. Size of The input and output + * @brief Binary calculation according to OpFunc. Shape of The input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. - * NX: The number of data columns loaded by each thread. - * NY: The number of data rows loaded by each thread. + * InT: The data type of in1 and in2. + * OutT: The data type of out. + * NX: The number of data columns computed by each thread. + * NY: The number of data rows computed by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { + * HOSTDEVICE InT operator()(const InT& a, const InT& b) const { * return ...; * } * }; @@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * out: The register pointer of out, the size is NX * NY. * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, } /** - * @brief Ternary calculation according to OpFunc. Size of input and output + * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) + * HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c) * const { * return ...; * } @@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, } /** - * @brief Multivariate calculation according to OpFunc. Size of input and output - * are the same. + * @brief Multivariate calculation according to OpFunc. Shape of inputs and + * output are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1, in2 and in3. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. - * Arity: The size of ins + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Arity: The size of ins. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT* args) const { + * HOSTDEVICE InT operator()(const InT* args) const { * return ...; * } * }; * * @param * out: The register pointer of out, the size is NX * NY. - * ins: An array of pointers consisting of multiple inputs. - * compute: Compute function which was declared like OpFunc(). + * ins: A pointers of array consisting of multiple inputs. + * compute: Compute function which was declared like OpFunc(). */ template @@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * shape is [NY, NX]. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following * template * struct XxxFunctor { @@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, * NX: The number of data continuously loaded by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * ReduceFunctor: Compute functor which has an operator() as following * template * struct ReduceFunctor { diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index c720bedf0a3afcfd903af7f3bf0c97791774e5ec..860072bd0c52ec5c04f52525a7d154d6576a2e43 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -118,8 +118,8 @@ struct BroadcastConfig { } // namespace details /** - * @brief Read 2D data from global memory to registers according to Tx type, and - * store it as Ty type. + * @brief Read 2D data from global memory to register according to Tx type, and + * store it as Ty type into register. * * @template paraments * Tx: The type of data stored in the global memory. @@ -127,8 +127,7 @@ struct BroadcastConfig { * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x blockDim, boundary judgment is required to avoid memory access @@ -136,20 +135,20 @@ struct BroadcastConfig { * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. - * size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter - * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * src: The data pointer of the current block. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. */ template __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, int size_nx, int size_ny, int stride_nx, int stride_ny) { - int thread_offset = threadIdx.x * NX; + int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; // Each branch is added for better performance @@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } @@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } /** - * @brief Read 2D data from global memory to registers. When IsBoundary = true + * @brief Read 1D data from global memory to register. When IsBoundary = true * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to * improve memory access efficiency. * * @template paraments - * T: Data type of src and dst. - * NX: The number of data continuously loaded by each thread. - * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. * When the number of data processed by this block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. + * src: The data pointer of the current block. * size: The current block needs to load size data continuously. */ template @@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } /** - * @brief Read 2D data from global memory to registers for broadcast. + * @brief Read 2D data from global memory to registers with broadcast form. * * @template paraments * T: The type of data stored in the global memory. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The original input data pointer of this kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * config: Calculation configuration of broadcast. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. + * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. */ template @@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output, int stride_nx, int stride_ny) { - uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; #pragma unroll @@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief Read 2D data from global memory to registers for reduce. + * @brief Read 2D data from global memory to register with reduce form. * * @template paraments - * T: The type of data stored in the global memory. + * T: The type of data. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The input data pointer of this block. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * index_cal: Calculation configuration of Reduce. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. - * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; - * index_cal: get the global index in src, attention config was declared in - * host; + * coordinate mapping relationship between output data and input data. * size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter + * parameter will participate in the calculation when isboundary = true. + * size_ny: The current block needs to load size_ny rows of data, this parameter + * will participate in the calculation when isboundary = true. * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx columns. + * stride_ny: Each read one element stride stride_ny raws. * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ @@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce( const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, int stride_ny, bool reduce_last_dim) { int thread_offset = 0; + int left_idx = 0; if (reduce_last_dim) { - thread_offset = block_offset + threadIdx.x; + thread_offset = threadIdx.x; + left_idx = threadIdx.y; } else { - thread_offset = block_offset + threadIdx.y; + thread_offset = threadIdx.y; + left_idx = threadIdx.x; } if (NX == 1) { @@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce( break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[ny] = src[index_src]; thread_offset += stride_ny; } } else { #pragma unroll for (int nx = 0; nx < NX; ++nx) { - if (IsBoundary) { - if (nx * stride_nx >= size_nx) { - break; - } - } #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if (nx * stride_nx >= size_nx) { + if ((thread_offset >= size_ny) || + (left_idx + nx * stride_nx >= size_nx)) { break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[nx + ny * NX] = src[index_src]; thread_offset += stride_ny; } - thread_offset += stride_nx; } } } @@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce( * * @template paraments * T: The type of data. - * NX: The number of data continuously loaded by each thread. + * NX: The number of data continuously writed by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: - * dst: Data pointer of the current block. - * src: The register pointer of the thread, the size is NX * NY. - * size: The current block needs to load size data continuously. + * dst: The data pointer of the current block. + * src: The register pointer, the size is NX * NY. + * size: The current block needs to load size elements continuously. */ template __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, @@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, } } +/** + * @brief Write 2D data from register to global memory according to Tx type, and + * store it as Ty type. + * + * @template paraments + * Tx: The type of data that needs to be stored in registers. + * Ty: The type of data that stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The data pointer of the current block. + * src: The register pointer of the thread, the size is NX * NY. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. + */ +template +__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, + int size_nx, int size_ny, + int stride_nx, int stride_ny) { + int thread_offset = threadIdx.x; + int left_size_nx = size_nx - thread_offset; + + // Each branch is added for better performance + if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1 + if (IsBoundary) { + if (left_size_nx > 0) { + dst[thread_offset] = static_cast(src[0]); + } + } else { + dst[thread_offset] = static_cast(src[0]); + } + } else if (NX == 1) { // for NX == 1 and NY != 1 +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idy * stride_ny] = static_cast(src[idy]); + } + } else if (NY == 1) { // for NY == 1 and NX != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } + dst[thread_offset + idx * stride_nx] = static_cast(src[idx]); + } + } else { // for NX != 1 and NY != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idx * stride_nx + idy * stride_ny] = + static_cast(src[idy * NX + idx]); + } + } + } +} + +/** + * @brief Initialize register with init_data. + * + * @template paraments + * T: Data type of register. + * NX: Number of data to initialize. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * init_data: The register pointer of init data, the size is NX. + */ +template +__device__ __forceinline__ void Init(T* dst, T* init_data, int num) { +#pragma unroll + for (int i = 0; i < NX; i++) { + if (IsBoundary) { + if (i >= num) { + break; + } + } + dst[i] = init_data[i]; + } +} + +/** + * @brief Read 1D data from global memory to register with broadcast form. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The original input data pointer of kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t block_offset, + details::BroadcastConfig config, int total_num_output) { + uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t index_src = 0; + +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t index_output = thread_offset + nx; + index_src = 0; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } +#pragma unroll + for (int i = 0; i < Rank; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(index_output); + index_output = fast_divmoder.val[0]; + index_src += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx] = src[index_src]; + } +} + } // namespace kernel_primitives } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 28b6ebc2433224ac6743b35df5e16c6f4b9402cd..bf451272a47b0a9a43b2f9a04dfc5e19877d9299 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -529,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer, kps::WriteData(y + store_offset, &temp_data, size); } +template +__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var, + ReduceOp reducer, TransformOp transformer, + MPType init, int reduce_num, int input_idx, + bool reduce_last_dim, + const Calculator& reduce_index_calculator, + int stride, int num) { + Tx input_reg[REDUCE_VEC_SIZE]; + MPType input_compute[REDUCE_VEC_SIZE]; + MPType input_transform[REDUCE_VEC_SIZE]; + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce( + &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, + 1, stride, reduce_last_dim); + kps::ElementwiseUnary( + &input_transform[0], &input_reg[0], transformer); + kps::Init(input_compute, input_transform, + num); + kps::Reduce( + reduce_var, &input_compute[0], reducer, reduce_last_dim); +} + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used @@ -570,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, // 1. reduce for each thread if (left_idx < left_num) { // load REDUCE_VEC_SIZE data once, and then compute - Tx input_reg[REDUCE_VEC_SIZE]; - MPType input_compute[REDUCE_VEC_SIZE]; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, - reduce_num, 1, stride, reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, - 1, stride, reduce_last_dim); - input_idx += tid; -#pragma unroll - for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - if (input_idx >= reduce_num) { - break; - } - input_compute[i] = static_cast(transformer(input_reg[i])); - input_idx += stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num, input_idx, + reduce_last_dim, reduce_index_calculator, stride, reduce_num); } - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); + int num = (reduce_num - input_idx - tid + stride - 1) / stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num - input_idx, + input_idx, reduce_last_dim, reduce_index_calculator, stride, num); } kps::Reduce(