From 976606fead5318b4fdec0e51088064b357ac3c55 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Tue, 14 Feb 2023 11:08:06 +0800 Subject: [PATCH] Decrease usage of GetVecSize for optimizing host computation efficiency (#50353) * first commit. * a few small changes * add some changes to get vec_size efficiently * fix bugs --------- Co-authored-by: zhangbopd <1299246947@qq.com> --- .../fluid/operators/fused/attn_bias_add.cu.h | 9 ++++----- paddle/phi/kernels/funcs/broadcast_function.h | 18 +++++++++--------- paddle/phi/kernels/funcs/elementwise_base.h | 6 ++++-- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 0c8618972e8..53001b24930 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -100,11 +100,10 @@ void LaunchBiasAddFwKernel(const phi::GPUContext& ctx, const T* in0, const T* in1, T* out) { - int in_vec_size = - std::min(phi::GetVectorizedSize(in0), phi::GetVectorizedSize(in1)); - int out_vec_size = std::min(4, phi::GetVectorizedSize(out)); - int vec_size = std::min(out_vec_size, in_vec_size); - + uint64_t addr = + (reinterpret_cast(in0) | reinterpret_cast(in1) | + reinterpret_cast(out)); + int vec_size = phi::GetVectorizedSize(reinterpret_cast(addr)); int numel = m * n; const int threads = 256; const int data_per_thread = 1; diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index cf974bdbe33..2c0aad3a662 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -44,8 +44,7 @@ struct LoaderTypeClassifier { LoaderTypeClassifier() {} LoaderTypeClassifier(const std::vector &ins, std::vector *outs) { - int out_vec_size = - std::min(4, phi::GetVectorizedSize((*outs)[0]->data())); + uint64_t out_addr = 
reinterpret_cast((*outs)[0]->data()); for (auto i = 1; i < outs->size(); ++i) { PADDLE_ENFORCE_EQ( (*outs)[i]->dims(), @@ -54,10 +53,13 @@ struct LoaderTypeClassifier { "The shape of each output tensor shall be identical yet, but " "%d-th output tensor`s shape is not.", i)); - out_vec_size = std::min( - phi::GetVectorizedSize((*outs)[i]->data()), out_vec_size); + out_addr = + (out_addr | reinterpret_cast((*outs)[i]->data())); } + int out_vec_size = + phi::GetVectorizedSize(reinterpret_cast(out_addr)); + uint64_t in_addr = static_cast(0); numel = (*outs)[0]->numel(); for (int i = 0; i < Arity; ++i) { auto in_data = ins[i]->data(); @@ -66,19 +68,17 @@ struct LoaderTypeClassifier { bool is_same_dim = ins[i]->numel() == numel; if (is_same_dim) { use_broadcast[i] = false; - auto temp_size = phi::GetVectorizedSize(in_data); - in_vec_size = std::min(temp_size, in_vec_size); + in_addr = (in_addr | reinterpret_cast(in_data)); } else { use_broadcast[i] = true; broadcast_num++; } all_elementwise &= is_same_dim; } + int in_vec_size = std::min( + 4, phi::GetVectorizedSize(reinterpret_cast(in_addr))); vec_size = std::min(out_vec_size, in_vec_size); } - - private: - int in_vec_size{4}; }; #ifndef PADDLE_WITH_XPU_KP diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index ffb3ff4ae33..1f937425805 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -567,13 +567,15 @@ int GetVectorizedSizeForTensors(const std::vector &ins, using ArgsT = typename Traits::ArgsTuple; const int Arity = Traits::arity; int vec_size = 4; + uint64_t addr = static_cast(0); ArgsT arg; // The Arg VecSize=1 is to match the Unroller template. 
Unroller::step(ins, arg, &vec_size); for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = - std::min(vec_size, phi::GetVectorizedSize((*iter)->data())); + addr = (addr | reinterpret_cast((*iter)->data())); } + vec_size = std::min( + vec_size, phi::GetVectorizedSize(reinterpret_cast(addr))); #endif return vec_size; } -- GitLab