Unverified commit 976606fe, authored by limingshu, committed by GitHub

Decrease usage of GetVecSize for optimizing host computation efficiency (#50353)

* first commit.

* a few small changes

* add some changes to get vec_size efficiently

* fix bugs

---------
Co-authored-by: zhangbopd <1299246947@qq.com>
Parent 2548657e
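The change targets host-side overhead: phi::GetVectorizedSize was previously called once per input and output pointer, with the results folded together through std::min. Because the returned vector width depends only on pointer alignment, and OR-ing addresses preserves every low-order set bit, a single call on the bitwise OR of all the addresses yields the same worst-case width. Below is a minimal sketch of that equivalence; VecSizeFromAlignment is a hypothetical stand-in for phi::GetVectorizedSize (capped at 4 here), and the buffer and offsets are made up for illustration only.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Hypothetical stand-in for phi::GetVectorizedSize<T>: report the widest
// vector load (in elements, capped at 4) whose byte alignment the pointer
// satisfies. Only the alignment-driven behaviour matters for this sketch.
template <typename T>
int VecSizeFromAlignment(const T* ptr) {
  uint64_t addr = reinterpret_cast<uint64_t>(ptr);
  for (int vec = 4; vec > 1; vec /= 2) {
    if (addr % (vec * sizeof(T)) == 0) return vec;
  }
  return 1;
}

int main() {
  alignas(64) float buffer[64];
  const float* in0 = buffer;      // 64-byte aligned -> vec_size 4
  const float* in1 = buffer + 2;  //  8-byte aligned -> vec_size 2
  const float* out = buffer + 3;  //  4-byte aligned -> vec_size 1

  // Old scheme: one alignment probe per pointer, folded with std::min.
  int vec_min = std::min({VecSizeFromAlignment(in0),
                          VecSizeFromAlignment(in1),
                          VecSizeFromAlignment(out)});

  // New scheme: OR the addresses, probe once. OR-ing keeps every low-order
  // set bit, so the combined address is exactly as misaligned as the worst
  // of the three pointers.
  uint64_t addr = reinterpret_cast<uint64_t>(in0) |
                  reinterpret_cast<uint64_t>(in1) |
                  reinterpret_cast<uint64_t>(out);
  int vec_once = VecSizeFromAlignment(reinterpret_cast<const float*>(addr));

  assert(vec_min == vec_once);  // same answer, a single probe on the host
  return 0;
}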
@@ -100,11 +100,10 @@ void LaunchBiasAddFwKernel(const phi::GPUContext& ctx,
                            const T* in0,
                            const T* in1,
                            T* out) {
-  int in_vec_size =
-      std::min(phi::GetVectorizedSize<T>(in0), phi::GetVectorizedSize<T>(in1));
-  int out_vec_size = std::min(4, phi::GetVectorizedSize<T>(out));
-  int vec_size = std::min(out_vec_size, in_vec_size);
+  uint64_t addr =
+      (reinterpret_cast<uint64_t>(in0) | reinterpret_cast<uint64_t>(in1) |
+       reinterpret_cast<uint64_t>(out));
+  int vec_size = phi::GetVectorizedSize<T>(reinterpret_cast<T*>(addr));
   int numel = m * n;
   const int threads = 256;
   const int data_per_thread = 1;
......
@@ -44,8 +44,7 @@ struct LoaderTypeClassifier {
   LoaderTypeClassifier() {}
   LoaderTypeClassifier(const std::vector<const DenseTensor *> &ins,
                        std::vector<DenseTensor *> *outs) {
-    int out_vec_size =
-        std::min(4, phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>()));
+    uint64_t out_addr = reinterpret_cast<uint64_t>((*outs)[0]->data<OutT>());
     for (auto i = 1; i < outs->size(); ++i) {
       PADDLE_ENFORCE_EQ(
           (*outs)[i]->dims(),
@@ -54,10 +53,13 @@ struct LoaderTypeClassifier {
               "The shape of each output tensor shall be identical yet, but "
               "%d-th output tensor`s shape is not.",
               i));
-      out_vec_size = std::min(
-          phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
+      out_addr =
+          (out_addr | reinterpret_cast<uint64_t>((*outs)[i]->data<OutT>()));
     }
+    int out_vec_size =
+        phi::GetVectorizedSize<OutT>(reinterpret_cast<OutT *>(out_addr));
+    uint64_t in_addr = static_cast<uint64_t>(0);
     numel = (*outs)[0]->numel();
     for (int i = 0; i < Arity; ++i) {
       auto in_data = ins[i]->data<InT>();
@@ -66,19 +68,17 @@ struct LoaderTypeClassifier {
       bool is_same_dim = ins[i]->numel() == numel;
       if (is_same_dim) {
         use_broadcast[i] = false;
-        auto temp_size = phi::GetVectorizedSize<InT>(in_data);
-        in_vec_size = std::min(temp_size, in_vec_size);
+        in_addr = (in_addr | reinterpret_cast<uint64_t>(in_data));
       } else {
         use_broadcast[i] = true;
         broadcast_num++;
       }
       all_elementwise &= is_same_dim;
     }
+    int in_vec_size = std::min(
+        4, phi::GetVectorizedSize<InT>(reinterpret_cast<InT *>(in_addr)));
     vec_size = std::min(out_vec_size, in_vec_size);
   }
-
- private:
-  int in_vec_size{4};
 };

 #ifndef PADDLE_WITH_XPU_KP
......
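A note on the LoaderTypeClassifier hunk above: in_addr starts at zero and only the addresses of non-broadcast inputs are OR-ed into it. If every input is broadcast, the accumulator stays at 0, and a zero address passes every alignment check, so the probe could report a width larger than 4 for small element types. The std::min(4, ...) clamp on in_vec_size appears to restore the cap that the removed in_vec_size{4} member used to provide.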
@@ -567,13 +567,15 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
   using ArgsT = typename Traits::ArgsTuple;
   const int Arity = Traits::arity;
   int vec_size = 4;
+  uint64_t addr = static_cast<uint64_t>(0);
   ArgsT arg;
   // The Arg VecSize=1 is to match the Unroller template.
   Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
-    vec_size =
-        std::min<int>(vec_size, phi::GetVectorizedSize((*iter)->data<OutT>()));
+    addr = (addr | reinterpret_cast<uint64_t>((*iter)->data<OutT>()));
   }
+  vec_size = std::min(
+      vec_size, phi::GetVectorizedSize<OutT>(reinterpret_cast<OutT *>(addr)));
 #endif
   return vec_size;
 }
......