Unverified commit 82b33be3, authored by niuliling123, committed by GitHub

Modify the reduce op according to the kernel primitive api (#35282)

Parent 7aa4d879
......@@ -202,9 +202,9 @@ void SetConfigForColumnReduce(const int max_threads, const int reduce_num,
int num_block = (max_threads / left_num);
if (num_block > 1 && reduce_num >= REDUCE_SPLIT_BOUNDARY) {
*blocking_size = detail::GetLastPow2(reduce_num / num_block);
*blocking_size = details::GetLastPow2(reduce_num / num_block);
if (*blocking_size <= 1) {
*blocking_size = detail::GetLastPow2(sqrt(reduce_num));
*blocking_size = details::GetLastPow2(sqrt(reduce_num));
} else if (*blocking_size * 2 < reduce_num) {
*blocking_size *= 2;
}
......
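For context, details::GetLastPow2 is used here as a round-down-to-power-of-two helper. Below is a minimal host-side sketch of the blocking-size heuristic above, assuming that semantics; the GetLastPow2 below is a hypothetical stand-in, not the Paddle implementation, and the REDUCE_SPLIT_BOUNDARY guard is omitted.

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for details::GetLastPow2: the largest power of two
// that does not exceed n (assumed semantics).
static int GetLastPow2(int n) {
  int p = 1;
  while (p * 2 <= n) p *= 2;
  return p;
}

int main() {
  // Mirrors the column-reduce blocking heuristic above for one sample shape.
  int max_threads = 1024, left_num = 32, reduce_num = 4096;
  int num_block = max_threads / left_num;                   // 32
  int blocking_size = GetLastPow2(reduce_num / num_block);  // GetLastPow2(128) = 128
  if (blocking_size <= 1) {
    blocking_size = GetLastPow2(static_cast<int>(std::sqrt(reduce_num)));
  } else if (blocking_size * 2 < reduce_num) {
    blocking_size *= 2;  // 256 for this example
  }
  printf("blocking_size = %d\n", blocking_size);
  return 0;
}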
......@@ -31,13 +31,15 @@ namespace kernel_primitives {
namespace details {
#ifdef __HIPCC__
constexpr int kMaxThread = 256;
constexpr int kReduceMaxThread = 256;
constexpr int kWarpSize = 64;
#else
constexpr int kMaxThread = 128;
constexpr int kReduceMaxThread = 128;
constexpr int kWarpSize = 32;
#endif
// kGlobalMode: block reduce, each block gets an output;
// kLocalMode: thread reduce, each thread gets an output;
enum ReduceMode { kGlobalMode, kLocalMode };
template <typename T>
......@@ -118,7 +120,7 @@ __device__ __forceinline__ T BlockXReduce(T val, ReduceOp reducer) {
*/
template <typename T, typename ReduceOp>
__device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
__shared__ T shared_memory[details::kMaxThread];
__shared__ T shared_memory[details::kReduceMaxThread];
shared_memory[SharedMemoryIndex(0)] = val;
for (int stride = blockDim.y / 2; stride > 0; stride >>= 1) {
__syncthreads();
......
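As an aside, BlockYReduce follows the standard halving-stride tree reduction over threadIdx.y. A hypothetical host-side restatement of that pattern follows (shared memory and __syncthreads omitted), assuming the number of participating values is a power of two, as blockDim.y is in practice.

#include <cstdio>
#include <functional>
#include <vector>

// Hypothetical host-side analogue of the halving-stride loop in BlockYReduce:
// repeatedly combine element i with element i + stride while stride halves.
// Assumes vals.size() is a power of two, like blockDim.y.
template <typename T, typename ReduceOp>
T TreeReduce(std::vector<T> vals, ReduceOp reducer) {
  for (size_t stride = vals.size() / 2; stride > 0; stride >>= 1) {
    for (size_t i = 0; i < stride; ++i) {
      vals[i] = reducer(vals[i], vals[i + stride]);
    }
  }
  return vals[0];
}

int main() {
  std::vector<int> vals = {1, 2, 3, 4, 5, 6, 7, 8};
  printf("sum = %d\n", TreeReduce(vals, std::plus<int>()));  // 36
  return 0;
}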
......@@ -124,36 +124,36 @@ struct BroadcastConfig {
template <typename Tx, typename Ty, int NX, int NY, int BlockSize>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int stride_nx, int stride_ny) {
int thread_offset = threadIdx.x * NX;
if (NY == 1 && NX == 1) {
dst[0] = static_cast<Ty>(src[threadIdx.x]);
dst[0] = static_cast<Ty>(src[thread_offset]);
} else if (NX == 1) {
int dx = threadIdx.x;
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
dst[idy] = static_cast<Ty>(src[dx + idy * stride_ny]);
dst[idy] = static_cast<Ty>(src[thread_offset + idy * stride_ny]);
}
} else if (NY == 1) {
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
dst[idx] = static_cast<Ty>(src[idx * stride_nx]);
dst[idx] = static_cast<Ty>(src[thread_offset + idx * stride_nx]);
}
} else {
int dx = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
#pragma unroll
for (int idy = 0; idy < NY; ++idy) {
dst[idy * NX + idx] =
static_cast<Ty>(src[idx * stride_nx + dx + idy * stride_ny]);
dst[idy * NX + idx] = static_cast<Ty>(
src[thread_offset + idx * stride_nx + idy * stride_ny]);
}
}
}
}
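To make the indexing convention explicit: each thread starts at thread_offset = threadIdx.x * NX and reads an NX x NY tile with the given strides. A hypothetical host-side emulation of the general (NX != 1, NY != 1) branch for a single thread:

#include <cstdio>

// Hypothetical host-side emulation of the strided ReadData above for one
// thread; thread_idx_x plays the role of threadIdx.x.
template <typename Tx, typename Ty, int NX, int NY>
void ReadTileForThread(Ty* dst, const Tx* src, int thread_idx_x, int stride_nx,
                       int stride_ny) {
  int thread_offset = thread_idx_x * NX;
  for (int idy = 0; idy < NY; ++idy) {
    for (int idx = 0; idx < NX; ++idx) {
      // Same address arithmetic as the general branch of ReadData.
      dst[idy * NX + idx] =
          static_cast<Ty>(src[thread_offset + idx * stride_nx + idy * stride_ny]);
    }
  }
}

int main() {
  float src[16];
  for (int i = 0; i < 16; ++i) src[i] = static_cast<float>(i);
  double dst[4];
  // "Thread" 1 reads a 2 x 2 tile: elements 2, 3 (row 0) and 10, 11 (row 1)
  // for stride_nx = 1, stride_ny = 8.
  ReadTileForThread<float, double, 2, 2>(dst, src, /*thread_idx_x=*/1, 1, 8);
  printf("%g %g %g %g\n", dst[0], dst[1], dst[2], dst[3]);
  return 0;
}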
/**
* @brief load data from src to dst, src can be 1D data or 2D data. When
* boundary judgment is required, set IsBoundary to true (it is false by
* default).
* @brief load data from src to dst with stride, src can be 1D data or 2D data.
* When boundary judgment is required, set IsBoundary to true (it is false by
* default).
* @typename:
* Tx: data type of src
* Ty: data type of dst
......@@ -172,17 +172,17 @@ template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
int dx = threadIdx.x * NX;
int size = size_nx - dx;
int thread_offset = threadIdx.x * NX;
int left_size_nx = size_nx - thread_offset;
// Each branch is added for better performance
if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1
if (IsBoundary) {
if (dx < size_nx) {
dst[0] = static_cast<Ty>(src[dx]);
if (left_size_nx > 0) {
dst[0] = static_cast<Ty>(src[thread_offset]);
}
} else {
dst[0] = static_cast<Ty>(src[dx]);
dst[0] = static_cast<Ty>(src[thread_offset]);
}
} else if (NX == 1) { // for NX == 1 and NY != 1
#pragma unroll
......@@ -192,23 +192,23 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
break;
}
}
dst[idy] = static_cast<Ty>(src[dx + idy * stride_ny]);
dst[idy] = static_cast<Ty>(src[thread_offset + idy * stride_ny]);
}
} else if (NY == 1) { // for NY == 1 and NX != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= size) {
if (idx >= left_size_nx) {
break;
}
}
dst[idx] = static_cast<Ty>(src[idx * stride_nx + dx]);
dst[idx] = static_cast<Ty>(src[thread_offset + idx * stride_nx]);
}
} else { // for NX != 1 and NY != 1
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (IsBoundary) {
if (idx >= size) {
if (idx >= left_size_nx) {
break;
}
}
......@@ -219,8 +219,8 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
break;
}
}
dst[idy * NX + idx] =
static_cast<Ty>(src[idx * stride_nx + dx + idy * stride_ny]);
dst[idy * NX + idx] = static_cast<Ty>(
src[thread_offset + idx * stride_nx + idy * stride_ny]);
}
}
}
......@@ -251,17 +251,17 @@ template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
int num) {
if (IsBoundary) { // blockDim.x * NX > num
int dx = threadIdx.x * NX;
int thread_offset = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if (idx + dx < num) {
dst[idx] = src[idx + dx];
if (idx + thread_offset < num) {
dst[idx] = src[thread_offset + idx];
}
}
} else { // blockDim.x * NX < num
const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
const int kVectorsPerThread = NX / kVectorSize;
int tid = threadIdx.x * kVectorsPerThread;
int thread_offset = threadIdx.x * kVectorsPerThread;
using VecType = details::VectorType<T, kVectorSize>;
const VecType* vec_input = reinterpret_cast<const VecType*>(src);
......@@ -269,7 +269,7 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
#pragma unroll
for (int i = 0; i < kVectorsPerThread; ++i) {
vec_temp[i] = vec_input[i + tid];
vec_temp[i] = vec_input[thread_offset + i];
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
dst[idx] = *(reinterpret_cast<T*>(vec_temp) + idx);
......@@ -289,39 +289,39 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
* is 2
* IsBoundary: whether to make boundary judgment
* @param:
* fix: data offset of this block, blockDim.x * blockIdx.x * NX;
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
* config: used to compute the global index in src; note that config is declared on the host;
* num: the num of out
* total_num_output: total number of output elements
* stride_nx: the stride of cols
* stride_ny: the stride of rows
*/
template <typename T, int NX, int NY, int BlockSize, int ShapeSize,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t fix,
details::BroadcastConfig<ShapeSize> config, int num, int stride_nx,
int stride_ny) {
uint32_t base_offset = fix + threadIdx.x * NX;
uint32_t offset = 0;
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<ShapeSize> config, int total_num_output,
int stride_nx, int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0;
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
#pragma unroll
for (uint32_t nx = 0; nx < NX; ++nx) {
uint32_t idx = base_offset + ny * stride_ny + nx * stride_nx;
uint32_t index_output = thread_offset + ny * stride_ny + nx * stride_nx;
index_src = 0;
if (IsBoundary) {
if (idx >= num) {
if (index_output >= total_num_output) {
break;
}
}
offset = 0;
#pragma unroll
for (int i = 0; i < ShapeSize; ++i) {
auto fast_divmoder = config.divmoders[i].Divmod(idx);
idx = fast_divmoder.val[0];
offset += fast_divmoder.val[1] * config.strides[i];
auto fast_divmoder = config.divmoders[i].Divmod(index_output);
index_output = fast_divmoder.val[0];
index_src += fast_divmoder.val[1] * config.strides[i];
}
dst[nx + ny * NX] = src[offset];
dst[nx + ny * NX] = src[index_src];
}
}
}
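The divmod loop above decomposes a linear output index dimension by dimension and accumulates the corresponding source offsets; broadcast dimensions contribute a stride of 0. A hypothetical host-side version of that mapping follows; the dimension order and the contents of config.divmoders/config.strides are assumptions about BroadcastConfig.

#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical host-side analogue of the divmod loop in ReadDataBc: map a
// linear output index to the source index it broadcasts from. out_dims holds
// the output extents in the order the divmoders divide by; src_strides holds
// the matching source strides, 0 for broadcast dimensions.
uint32_t BroadcastSrcIndex(uint32_t index_output,
                           const std::vector<uint32_t>& out_dims,
                           const std::vector<uint32_t>& src_strides) {
  uint32_t index_src = 0;
  for (size_t i = 0; i < out_dims.size(); ++i) {
    uint32_t coord = index_output % out_dims[i];  // Divmod remainder
    index_output /= out_dims[i];                  // Divmod quotient
    index_src += coord * src_strides[i];
  }
  return index_src;
}

int main() {
  // Broadcasting a [1, 4] source over a [3, 4] output (innermost dimension
  // first): the broadcast row dimension carries a source stride of 0.
  std::vector<uint32_t> out_dims = {4, 3};
  std::vector<uint32_t> src_strides = {1, 0};
  printf("output 7 reads source %u\n",
         BroadcastSrcIndex(7, out_dims, src_strides));  // 3
  return 0;
}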
......@@ -338,7 +338,7 @@ __device__ __forceinline__ void ReadDataBc(
* IndexCal: functor type used to compute the global index in src; it is declared on the host;
* IsBoundary: whether to make boundary judgment
* @param:
* fix: data offset of this block, blockDim.x * blockIdx.x * NX;
* block_offset: data offset of this block, blockDim.x * blockIdx.x * NX;
* index_cal: functor used to compute the global index in src; note that it is
* declared on the host;
* size_nx: number of columns to be processed by the current block
......@@ -350,27 +350,27 @@ __device__ __forceinline__ void ReadDataBc(
template <typename T, int NX, int NY, int BlockSize, int ShapeSize,
typename IndexCal, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(
T* dst, const T* __restrict__ src, int fix, const IndexCal& index_cal,
int size_nx, int size_ny, int stride_nx, int stride_ny,
bool reduce_last_dim) {
int base_offset = fix;
T* dst, const T* __restrict__ src, int block_offset,
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
int stride_ny, bool reduce_last_dim) {
int thread_offset = 0;
if (reduce_last_dim) {
base_offset += threadIdx.x;
thread_offset = block_offset + threadIdx.x;
} else {
base_offset += threadIdx.y;
thread_offset = block_offset + threadIdx.y;
}
if (NX == 1) {
#pragma unroll
for (int ny = 0; ny < NY; ++ny) {
if (IsBoundary) {
if (base_offset >= size_ny) {
if (thread_offset >= size_ny) {
break;
}
}
uint32_t offset = index_cal(base_offset);
dst[ny] = src[offset];
base_offset += stride_ny;
uint32_t index_src = index_cal(thread_offset);
dst[ny] = src[index_src];
thread_offset += stride_ny;
}
} else {
#pragma unroll
......@@ -387,15 +387,16 @@ __device__ __forceinline__ void ReadDataReduce(
break;
}
}
uint32_t offset = index_cal(base_offset);
dst[nx + ny * NX] = src[offset];
base_offset += stride_ny;
uint32_t index_src = index_cal(thread_offset);
dst[nx + ny * NX] = src[index_src];
thread_offset += stride_ny;
}
thread_offset += stride_nx;
}
}
}
/** @brief: WriteData
/**
* @brief store data from src to dst, src can be 1D data, you should set NY = 1.
* When boundary judgment is required, set IsBoundary to true (it is false by
* default).
......@@ -412,11 +413,11 @@ template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
int num) {
if (IsBoundary) {
int dx = threadIdx.x * NX;
int thread_offset = threadIdx.x * NX;
#pragma unroll
for (int idx = 0; idx < NX; ++idx) {
if ((idx + dx) < num) {
dst[idx + dx] = src[idx];
if ((thread_offset + idx) < num) {
dst[thread_offset + idx] = src[idx];
}
}
} else {
......@@ -424,14 +425,14 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
const int kVectorSize = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1;
const int kVectorsPerThread = NX / kVectorSize;
int dx = threadIdx.x * kVectorsPerThread;
int thread_offset = threadIdx.x * kVectorsPerThread;
using VecType = details::VectorType<T, kVectorSize>;
VecType* vec_dst = reinterpret_cast<VecType*>(dst);
VecType vec_temp[kVectorsPerThread];
#pragma unroll
for (int idx = 0; idx < kVectorsPerThread; ++idx) {
vec_temp[idx] = *(reinterpret_cast<VecType*>(src) + idx);
vec_dst[dx + idx] = vec_temp[idx];
vec_dst[thread_offset + idx] = vec_temp[idx];
}
}
}
......
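The vectorized fast paths of ReadData/WriteData pick the widest vector that divides NX. A hypothetical restatement of that choice, checked at compile time:

#include <cstdio>

// Hypothetical restatement of the vector-width heuristic used by the
// ReadData/WriteData fast paths above: use 4-wide vector accesses when NX is
// a multiple of 4, else 2-wide, else scalar.
constexpr int PickVectorSize(int nx) {
  return (nx % 4 == 0) ? 4 : (nx % 2 == 0) ? 2 : 1;
}

static_assert(PickVectorSize(8) == 4, "NX = 8 -> 4-wide vectors");
static_assert(PickVectorSize(6) == 2, "NX = 6 -> 2-wide vectors");
static_assert(PickVectorSize(3) == 1, "NX = 3 -> scalar accesses");

int main() {
  printf("NX=8 -> %d, NX=6 -> %d, NX=3 -> %d\n", PickVectorSize(8),
         PickVectorSize(6), PickVectorSize(3));
  return 0;
}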
......@@ -32,7 +32,6 @@ static __device__ __forceinline__ platform::float16 LogFunctor(
static __device__ __forceinline__ float LogFunctor(float x) { return logf(x); }
static __device__ __forceinline__ double LogFunctor(double x) { return log(x); }
} // namespace details
/*************************** Compute Functor****************************/
// for margin_cross_entropy
template <typename Tx, typename Ty = Tx>
......@@ -75,6 +74,7 @@ struct DivideFunctor {
T n_inv;
};
} // namespace details
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
......@@ -128,39 +128,9 @@ __global__ void AddMarginToPositiveLogitsKernel(
}
}
static __device__ __forceinline__ platform::float16 exp_on_device(
platform::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
platform::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float log_on_device(float x) {
return logf(x);
}
static __device__ __forceinline__ double log_on_device(double x) {
return log(x);
}
template <typename Tx, typename Ty = Tx>
struct ExpLogitTransformer {
HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}
HOSTDEVICE inline Ty operator()(const Tx& x) const {
return static_cast<Ty>(exp_on_device(x));
}
};
template <typename Tx, typename Ty = Tx>
struct ExpAndSum {
using Transformer = ExpLogitTransformer<Tx>;
using Transformer = kpds::ExpLogitTransformer<Tx>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
......@@ -189,7 +159,7 @@ __global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
const int64_t N, const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
logits[i] -= log_on_device(logits_sum_per_row[row]);
logits[i] -= kpds::LogFunctor(logits_sum_per_row[row]);
}
}
......@@ -204,9 +174,9 @@ __global__ void HardLabelSoftmaxWithCrossEntropyKernel(
if ((col + start_index) == labels[row]) {
auto softmax = log_softmax[i];
loss[row] = -softmax;
log_softmax[i] = exp_on_device(softmax);
log_softmax[i] = kpds::ExpFunctor(softmax);
} else {
log_softmax[i] = exp_on_device(log_softmax[i]);
log_softmax[i] = kpds::ExpFunctor(log_softmax[i]);
}
}
}
......
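For orientation, the two kernels touched above implement hard-label softmax cross entropy in log space: LogitsMinusLogSumKernel turns logits into log-probabilities, and HardLabelSoftmaxWithCrossEntropyKernel emits loss = -log p[label] while overwriting the buffer with p = exp(log p). A hypothetical single-row, host-side restatement (not the CUDA kernel itself):

#include <cmath>
#include <cstdio>
#include <vector>

// Hypothetical host-side restatement of one row of the hard-label path above.
// On entry, log_softmax holds log-probabilities (logits minus log-sum-exp);
// on exit it holds probabilities, and loss is -log p[label].
void HardLabelCrossEntropyRow(std::vector<double>* log_softmax, int label,
                              double* loss) {
  for (int col = 0; col < static_cast<int>(log_softmax->size()); ++col) {
    double ls = (*log_softmax)[col];
    if (col == label) {
      *loss = -ls;
    }
    (*log_softmax)[col] = std::exp(ls);
  }
}

int main() {
  std::vector<double> log_p = {std::log(0.7), std::log(0.2), std::log(0.1)};
  double loss = 0.0;
  HardLabelCrossEntropyRow(&log_p, /*label=*/0, &loss);
  printf("loss = %f, p[0] = %f\n", loss, log_p[0]);  // ~0.356675, 0.7
  return 0;
}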
......@@ -24,9 +24,11 @@ limitations under the License. */
namespace paddle {
namespace operators {
namespace kpds = paddle::operators::kernel_primitives::details;
template <typename Tx, typename Ty = Tx>
struct CustomMin {
using Transformer = detail::IdentityFunctor<Tx>;
using Transformer = kpds::IdentityFunctor<Tx>;
inline Ty initial() {
return static_cast<Ty>(std::numeric_limits<Ty>::max());
......@@ -39,7 +41,7 @@ struct CustomMin {
template <typename Tx, typename Ty = Tx>
struct CustomMax {
using Transformer = detail::IdentityFunctor<Tx>;
using Transformer = kpds::IdentityFunctor<Tx>;
inline Ty initial() {
return static_cast<Ty>(std::numeric_limits<Ty>::lowest());
......@@ -53,7 +55,7 @@ struct CustomMax {
// for cub::Reduce
template <typename Tx, typename Ty = Tx>
struct CustomSum {
using Transformer = detail::IdentityFunctor<Tx, Ty>;
using Transformer = kpds::IdentityFunctor<Tx, Ty>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
......@@ -64,7 +66,7 @@ struct CustomSum {
template <typename Tx, typename Ty = Tx>
struct CustomMean {
using Transformer = detail::DivideFunctor<Tx>;
using Transformer = kpds::DivideFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
......@@ -75,7 +77,7 @@ struct CustomMean {
template <typename Tx, typename Ty = Tx>
struct CustomMul {
using Transformer = detail::IdentityFunctor<Tx>;
using Transformer = kpds::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(1.0f); }
......@@ -86,7 +88,7 @@ struct CustomMul {
template <typename Tx, typename Ty = Tx>
struct CustomLogicalOr {
using Transformer = detail::IdentityFunctor<Tx>;
using Transformer = kpds::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(false); }
......@@ -97,7 +99,7 @@ struct CustomLogicalOr {
template <typename Tx, typename Ty = Tx>
struct CustomLogicalAnd {
using Transformer = detail::IdentityFunctor<Tx>;
using Transformer = kpds::IdentityFunctor<Tx>;
inline Ty initial() { return static_cast<Ty>(true); }
......
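These reduce descriptors all follow the same contract: Transformer maps each input element, initial() supplies the identity value, and operator() combines two partial results. A hypothetical host-side sketch of how a reduce loop consumes such a descriptor (the struct below is illustrative, not one of the functors in this file):

#include <cstdio>

// Hypothetical descriptor following the same contract as CustomSum et al.:
// an identity value plus a binary combine (the element-wise Transformer is
// folded into the loop below for brevity).
struct HypotheticalSum {
  float initial() const { return 0.0f; }
  float operator()(const float& a, const float& b) const { return a + b; }
};

int main() {
  const float data[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  HypotheticalSum reducer;
  float acc = reducer.initial();
  for (float v : data) {
    acc = reducer(acc, v);  // a real reduce would apply Transformer(v) here
  }
  printf("reduced value = %f\n", acc);  // 10.0
  return 0;
}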