Unverified · Commit d5afc1ba, authored by shixingbo, committed by GitHub

Optimized the performance of activation op in XPU2 (#43187)

Parent 9551e466
@@ -44,7 +44,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x,
   phi::Array<_ptr_ OutT *, 1> out_arr;
   out_arr[0] = y;
   phi::funcs::VectorizedElementwiseKernel<OutT, FunctorT, 1, 1, VecSize>
-      <<<block, thread, 0, stream>>>(in_arr, out_arr, n, main_offset,
+      <<<block, thread, 0, stream>>>(in_arr, out_arr, n, main_offset, VecSize,
                                      FunctorT());
 }
...
@@ -513,19 +513,23 @@ struct Loader {
                                ArgsT *args,
                                int num,
                                int data_offset,
+                               int read_lens,
                                bool is_boundary) {
     using Type = std::tuple_element_t<Index, ArgsT>;
-    kps::Init<Type, ArgsT, Index, VecSize>(args, static_cast<Type>(1.0f));
+    kps::Init<Type, ArgsT, Index, VecSize>(
+        args, static_cast<Type>(1.0f), read_lens);
     if (is_boundary) {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
           args,
           reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
-          num);
+          num,
+          read_lens);
     } else {
       kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
           args,
           reinterpret_cast<const _ptr_ Type *>(in[Index]) + data_offset,
-          num);
+          num,
+          read_lens);
     }
   }
 };
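Loader::Apply above is never called directly: it is stamped out once per input by the index-recursive Unroller visible at the call site in a later hunk (Unroller<Loader, VecSize, Arity>::step), so each input reads with the element type stored at its Index in the ArgsT tuple, and this commit merely threads read_lens through that call chain. For context, a minimal standalone C++ sketch of such an index-recursive unroller; it is a simplified illustration under assumed names (PrintLoader, an Unroller with a template-template parameter), not the Paddle implementation:

```cpp
#include <cstdio>
#include <tuple>

// Minimal index-recursive "unroller": calls Func<0>::Apply, Func<1>::Apply, ...
// at compile time, which is how one loader can handle per-input element types.
template <template <int> class Func, int End, int Begin = 0>
struct Unroller {
  template <typename... Args>
  static void step(Args &&...args) {
    Func<Begin>::Apply(args...);
    Unroller<Func, End, Begin + 1>::step(args...);
  }
};
template <template <int> class Func, int End>
struct Unroller<Func, End, End> {
  template <typename... Args>
  static void step(Args &&...) {}
};

// Hypothetical loader: prints the Index-th tuple element instead of reading
// global memory the way the real Loader does.
template <int Index>
struct PrintLoader {
  template <typename ArgsT>
  static void Apply(const ArgsT &args, int read_lens) {
    printf("input %d: value %g, read_lens %d\n", Index,
           static_cast<double>(std::get<Index>(args)), read_lens);
  }
};

int main() {
  std::tuple<float, double> args{1.5f, 2.5};
  Unroller<PrintLoader, 2>::step(args, /*read_lens=*/4);  // inputs 0 and 1
  return 0;
}
```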
@@ -660,11 +664,20 @@ template <typename OutT,
           typename ArgsT,
           int Arity>
 struct SameDimsElementwisePrimitiveCaller {
-  __device__ inline void operator()(Functor func, ArgsT *args, OutT *result) {
+  __device__ inline void operator()(Functor func,
+                                    ArgsT *args,
+                                    OutT *result,
+                                    int read_lens) {
+#ifdef PADDLE_WITH_XPU_KP
+    for (int idx = 0; idx < read_lens; ++idx) {
+      result[idx] = static_cast<OutT>(Apply(func, args[idx]));
+    }
+#else
 #pragma unroll
     for (int idx = 0; idx < VecSize; ++idx) {
       result[idx] = static_cast<OutT>(Apply(func, args[idx]));
     }
+#endif
   }
 };
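The new PADDLE_WITH_XPU_KP branch swaps the compile-time VecSize trip count for the runtime read_lens, so one instantiation of the caller can apply the functor to however many elements the launcher decided each core should buffer. A minimal host-side sketch of the two looping strategies; ApplyFixed and ApplyRuntime are illustrative names, not Paddle APIs:

```cpp
#include <cstdio>

// CUDA-style path: the trip count is a template parameter, so the compiler
// can fully unroll the loop (the real code adds #pragma unroll).
template <int VecSize, typename Functor, typename T>
void ApplyFixed(Functor func, const T *args, T *result) {
  for (int idx = 0; idx < VecSize; ++idx) {
    result[idx] = func(args[idx]);
  }
}

// XPU_KP-style path: the trip count arrives at run time, trading unrolling
// for the freedom to pick read_lens per launch.
template <typename Functor, typename T>
void ApplyRuntime(Functor func, const T *args, T *result, int read_lens) {
  for (int idx = 0; idx < read_lens; ++idx) {
    result[idx] = func(args[idx]);
  }
}

int main() {
  float in[8] = {-1, 2, -3, 4, -5, 6, -7, 8};
  float out[8] = {};
  auto relu = [](float x) { return x > 0.f ? x : 0.f; };
  ApplyFixed<4>(relu, in, out);       // always 4 elements
  ApplyRuntime(relu, in, out, 6);     // read_lens chosen at run time
  printf("%g %g\n", out[1], out[5]);  // 2 6
  return 0;
}
```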
@@ -750,6 +763,7 @@ __device__ void VectorizedElementwiseKernelImpl(
     phi::Array<_ptr_ OutT *, NumOuts> outs,
     int num,
     int data_offset,
+    int read_lens,
     Functor func) {
   using Traits = paddle::platform::FunctionTraits<Functor>;
   using ArgsT = typename Traits::ArgsTuple;
@@ -757,16 +771,16 @@ __device__ void VectorizedElementwiseKernelImpl(
   ConditionalT<OutT, NumOuts> result[VecSize];
   Unroller<Loader, VecSize, Arity>::step(
-      in, args, num, data_offset, IsBoundary);
+      in, args, num, data_offset, read_lens, IsBoundary);
   SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
                                      VecSize,
                                      Functor,
                                      ArgsT,
-                                     Arity>()(func, args, result);
+                                     Arity>()(func, args, result, read_lens);
-  ElementwiseWriteDataCaller<OutT, VecSize, IsBoundary, NumOuts>()(
-      outs, result, data_offset, num);
+  ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, NumOuts>()(
+      outs, result, data_offset, num, read_lens);
 }
 template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
@@ -775,9 +789,10 @@ __global__ void VectorizedElementwiseKernel(
     phi::Array<_ptr_ OutT *, NumOuts> outs,
     int size,
     int main_offset,
+    int read_lens,
     Functor func) {
-  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize;
-  int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize;
+  int data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens;
+  int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens;
   for (; data_offset < main_offset; data_offset += stride) {
     VectorizedElementwiseKernelImpl<OutT,
                                     Functor,
@@ -785,7 +800,7 @@ __global__ void VectorizedElementwiseKernel(
                                     NumOuts,
                                     VecSize,
                                     false>(
-        ins, outs, VecSize * BLOCK_NUM_X, data_offset, func);
+        ins, outs, read_lens * BLOCK_NUM_X, data_offset, read_lens, func);
   }

   int num = size - data_offset;
@@ -795,7 +810,8 @@ __global__ void VectorizedElementwiseKernel(
                                     Arity,
                                     NumOuts,
                                     VecSize,
-                                    true>(ins, outs, num, data_offset, func);
+                                    true>(
+        ins, outs, num, data_offset, read_lens, func);
   }
 }
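The kernel now tiles the flat index space in units of BLOCK_NUM_X * read_lens: offsets below main_offset are full tiles handled without per-element bounds checks, and at most one block picks up the checked tail. A host-side C++ sketch of that index arithmetic, where block_num, grid_num and so on are illustrative stand-ins for BLOCK_NUM_X, GRID_NUM_X and friends:

```cpp
#include <cstdio>

int main() {
  const int numel = 1000;    // total elements
  const int block_num = 64;  // threads per block (BLOCK_NUM_X)
  const int grid_num = 8;    // blocks in the grid (GRID_NUM_X)
  const int read_lens = 4;   // elements per thread per iteration

  const int tile = block_num * read_lens;         // 256 elements per block-tile
  const int main_offset = (numel / tile) * tile;  // 768: largest multiple of tile

  for (int block_id = 0; block_id < grid_num; ++block_id) {
    int data_offset = block_id * tile;            // BLOCK_ID_X * BLOCK_NUM_X * read_lens
    int stride = block_num * grid_num * read_lens;
    for (; data_offset < main_offset; data_offset += stride) {
      // full tile: no per-element boundary checks needed
    }
    int num = numel - data_offset;                // remaining elements, if any
    if (num > 0) {
      printf("block %d handles the %d-element tail at offset %d\n",
             block_id, num, data_offset);         // block 3 handles 232 at 768
    }
  }
  return 0;
}
```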
@@ -803,6 +819,7 @@ template <typename OutT, typename Functor, int Arity, int NumOuts, int VecSize>
 void ElementwiseCudaKernel(const KPDevice &ctx,
                            const std::vector<const DenseTensor *> &ins,
                            std::vector<DenseTensor *> *outs,
+                           int read_lens,
                            Functor func) {
   auto numel =
       (*outs)[0]->numel();  // To avoid running errors when ins.size()== 0
@@ -817,10 +834,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   int block_size = 64;
   int grid_size = 8;
   auto stream = ctx.x_context()->xpu_stream;
-  int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size;
+  int main_offset = (numel / (read_lens * block_size)) * read_lens * block_size;
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<grid_size, block_size, 0, stream>>>(
-          ins_data, outs_data, numel, main_offset, func);
+          ins_data, outs_data, numel, main_offset, read_lens, func);
 #else
   auto gpu_config =
       phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize);
@@ -829,7 +846,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
   auto stream = ctx.stream();
   VectorizedElementwiseKernel<OutT, Functor, Arity, NumOuts, VecSize>
       <<<gpu_config.block_per_grid, gpu_config.thread_per_block, 0, stream>>>(
-          ins_data, outs_data, numel, main_offset, func);
+          ins_data, outs_data, numel, main_offset, VecSize, func);
 #endif
 }
@@ -868,20 +885,32 @@ void ElementwiseKernel(const KPDevice &ctx,
     }
   }
+#ifdef PADDLE_WITH_XPU_KP
+  const int buf_size = 256;
+  int numel = (*outs)[0]->numel();
+  int block_size = 64;
+  int grid_size = 8;
+  int nthreads = block_size * grid_size;
+  int read_lens =
+      std::min(buf_size, kps::details::RoundUpDiv(numel, 32 * nthreads) * 32);
+  int vec_size = buf_size;
+#else
   // calculate the max vec_size for all ins and outs
   int vec_size = GetVectorizedSizeForTensors<OutT, Functor>(ins, *outs);
+  int read_lens = vec_size;
+#endif
   switch (vec_size) {
-    case 4:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 4>(
-          ctx, ins, outs, func);
+    case VecSizeL:
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeL>(
+          ctx, ins, outs, read_lens, func);
       break;
-    case 2:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 2>(
-          ctx, ins, outs, func);
+    case VecSizeM:
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeM>(
+          ctx, ins, outs, read_lens, func);
       break;
-    case 1:
-      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, 1>(
-          ctx, ins, outs, func);
+    case VecSizeS:
+      ElementwiseCudaKernel<OutT, Functor, kArity, NumOuts, VecSizeS>(
+          ctx, ins, outs, read_lens, func);
       break;
     default: {
       PADDLE_THROW(phi::errors::Unimplemented(
...
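On the XPU_KP path the dispatch constant (vec_size, fixed at buf_size = 256) is decoupled from the runtime read_lens, which is sized from the tensor: the element count is spread over all 512 in-flight cores, rounded up to a multiple of 32, and capped at the 256-element local buffer. A standalone sketch of that arithmetic, with RoundUpDiv re-implemented here purely for illustration:

```cpp
#include <algorithm>
#include <cstdio>

// Mirrors the helper added to kps::details: ceiling division.
int RoundUpDiv(int n, int k) { return (n + k - 1) / k; }

int main() {
  const int buf_size = 256;                     // per-core local buffer length
  const int block_size = 64, grid_size = 8;
  const int nthreads = block_size * grid_size;  // 512 cores in flight

  // read_lens grows with numel in steps of 32 and saturates at buf_size.
  const int sizes[] = {1000, 100000, 10000000};
  for (int numel : sizes) {
    int read_lens =
        std::min(buf_size, RoundUpDiv(numel, 32 * nthreads) * 32);
    printf("numel = %8d  ->  read_lens = %3d\n", numel, read_lens);
  }
  return 0;
}
```

With these numbers, a small tensor of 1000 elements reads only 32 elements per core per pass, a mid-sized tensor of 100000 elements reads 224, and anything large enough saturates at 256, while the CUDA branch keeps read_lens equal to the compile-time vec_size.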
@@ -259,7 +259,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) {
  * it supports different data types of inputs.
  */
 template <typename T, typename ArgsT, int Index, int NX>
-__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
 #pragma unroll
   for (int i = 0; i < NX; i++) {
     std::get<Index>(dst[i]) = init_data;
@@ -382,7 +382,8 @@ template <typename T,
           bool IsBoundary = false>
 __device__ __forceinline__ void ReadData(ArgsT* dst,
                                          const T* __restrict__ src,
-                                         int num) {
+                                         int num,
+                                         int read_lens) {
   if (IsBoundary) {  // blockDim.x * NX > num
     int thread_offset = threadIdx.x * NX;
 #pragma unroll
...
@@ -21,6 +21,8 @@ namespace phi {
 namespace kps {
 namespace details {
+int RoundUpDiv(int n, int k) { return (n + k - 1) / k; }
+
 enum class OptType {    // Optimize type of calc after input shape compressed
   CanNotOptimize = -1,  // can not optimize, broadcast first
   N_1,                  // just like {1} op {100} or {100} op {1}
@@ -425,9 +427,10 @@ __device__ __inline__ void Init(T* dst, T init_data, int read_lens) {
  * it supports different data types of inputs.
  */
 template <typename T, typename ArgsT, int Index, int NX>
-__device__ __forceinline__ void Init(ArgsT* dst, T init_data) {
+__device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
+  mfence();
 #pragma unroll
-  for (int i = 0; i < NX; i++) {
+  for (int i = 0; i < read_lens; i++) {
     std::get<Index>(dst[i]) = init_data;
   }
 }
@@ -523,22 +526,24 @@ template <typename T,
           bool IsBoundary>
 __device__ __forceinline__ void ReadData(ArgsT* dst,
                                          const T _global_ptr_* src,
-                                         int num) {
-  int thread_offset = core_id() * NX;
+                                         int num,
+                                         int read_lens) {
+  int thread_offset = core_id() * read_lens;
   __local__ T in_temp[1];
   __local__ T in_vec[NX];
-  if (IsBoundary) {  // core_num() * NX > num
+  if (IsBoundary) {  // core_num() * read_lens > num
 #pragma unroll
-    for (int idx = 0; idx < NX; ++idx) {
+    for (int idx = 0; idx < read_lens; ++idx) {
       if (idx + thread_offset < num) {
         GM2LM(src + thread_offset + idx, in_temp, sizeof(T));
         std::get<Index>(dst[idx]) = in_temp[0];
+        mfence();
       }
     }
-  } else {  // core_num() * NX < num
-    GM2LM(src + thread_offset, in_vec, NX * sizeof(T));
+  } else {  // core_num() * read_lens < num
+    GM2LM(src + thread_offset, in_vec, read_lens * sizeof(T));
 #pragma unroll
-    for (int idx = 0; idx < NX; ++idx) {
+    for (int idx = 0; idx < read_lens; ++idx) {
       std::get<Index>(dst[idx]) = in_vec[idx];
     }
   }
@@ -727,10 +732,12 @@ __device__ void WriteData(T _global_ptr_* dst,
     for (int idx = 0; idx < read_lens; ++idx) {
       if (idx + thread_offset < num) {
         in_temp[0] = src[idx];
+        mfence();
         LM2GM(in_temp, dst + idx + thread_offset, sizeof(T));
       }
     }
   } else {  // core_num() * read_lens < num
+    mfence();
     LM2GM(src, dst + thread_offset, read_lens * sizeof(T));
   }
 }
...
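The XPU ReadData primitive above either gathers element by element when the tile straddles the end of the data (boundary case) or issues one bulk GM2LM transfer for a full tile, and the mfence() calls added in this commit order those local-memory accesses against the surrounding transfers. A plain C++ analog of the two read paths, with memcpy standing in for GM2LM and no fence semantics; the names and the 256-slot scratch buffer are assumptions for illustration:

```cpp
#include <cstdio>
#include <cstring>
#include <tuple>

// Host-side analog of the XPU ReadData primitive: fill the Index-th slot of
// each ArgsT tuple with up to read_lens values starting at thread_offset.
template <typename T, typename ArgsT, int Index>
void ReadDataSketch(ArgsT *dst, const T *src, int num, int read_lens,
                    int thread_offset, bool is_boundary) {
  if (is_boundary) {  // core_num() * read_lens > num: check every element
    for (int idx = 0; idx < read_lens; ++idx) {
      if (idx + thread_offset < num) {
        T tmp;
        std::memcpy(&tmp, src + thread_offset + idx, sizeof(T));  // ~GM2LM
        std::get<Index>(dst[idx]) = tmp;
      }
    }
  } else {  // full tile: one bulk transfer into a local scratch buffer
    T vec[256];  // stand-in for the __local__ buffer; assumes read_lens <= 256
    std::memcpy(vec, src + thread_offset, read_lens * sizeof(T));  // ~GM2LM
    for (int idx = 0; idx < read_lens; ++idx) {
      std::get<Index>(dst[idx]) = vec[idx];
    }
  }
}

int main() {
  using Args = std::tuple<float>;
  float src[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
  Args dst[4];
  // Last core of a 10-element tensor: only 2 of its 4 slots are in range.
  ReadDataSketch<float, Args, 0>(dst, src, /*num=*/10, /*read_lens=*/4,
                                 /*thread_offset=*/8, /*is_boundary=*/true);
  printf("%g %g\n", std::get<0>(dst[0]), std::get<0>(dst[1]));  // 8 9
  return 0;
}
```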