未验证 提交 976606fe 编写于 作者: limingshu 提交者: GitHub

Decrease usage of GetVecSize for optimizing host computation efficiency (#50353)

* first commit.

* a little changes

* add some changes for get vec_size efficiently

* fix bugs

---------
Co-authored-by: zhangbopd <1299246947@qq.com>
上级 2548657e
......@@ -100,11 +100,10 @@ void LaunchBiasAddFwKernel(const phi::GPUContext& ctx,
const T* in0,
const T* in1,
T* out) {
int in_vec_size =
std::min(phi::GetVectorizedSize<T>(in0), phi::GetVectorizedSize<T>(in1));
int out_vec_size = std::min(4, phi::GetVectorizedSize<T>(out));
int vec_size = std::min(out_vec_size, in_vec_size);
uint64_t addr =
(reinterpret_cast<uint64_t>(in0) | reinterpret_cast<uint64_t>(in1) |
reinterpret_cast<uint64_t>(out));
int vec_size = phi::GetVectorizedSize<T>(reinterpret_cast<T*>(addr));
int numel = m * n;
const int threads = 256;
const int data_per_thread = 1;
......
......@@ -44,8 +44,7 @@ struct LoaderTypeClassifier {
LoaderTypeClassifier() {}
LoaderTypeClassifier(const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs) {
int out_vec_size =
std::min(4, phi::GetVectorizedSize<OutT>((*outs)[0]->data<OutT>()));
uint64_t out_addr = reinterpret_cast<uint64_t>((*outs)[0]->data<OutT>());
for (auto i = 1; i < outs->size(); ++i) {
PADDLE_ENFORCE_EQ(
(*outs)[i]->dims(),
......@@ -54,10 +53,13 @@ struct LoaderTypeClassifier {
"The shape of each output tensor shall be identical yet, but "
"%d-th output tensor`s shape is not.",
i));
out_vec_size = std::min(
phi::GetVectorizedSize<OutT>((*outs)[i]->data<OutT>()), out_vec_size);
out_addr =
(out_addr | reinterpret_cast<uint64_t>((*outs)[i]->data<OutT>()));
}
int out_vec_size =
phi::GetVectorizedSize<OutT>(reinterpret_cast<OutT *>(out_addr));
uint64_t in_addr = static_cast<uint64_t>(0);
numel = (*outs)[0]->numel();
for (int i = 0; i < Arity; ++i) {
auto in_data = ins[i]->data<InT>();
......@@ -66,19 +68,17 @@ struct LoaderTypeClassifier {
bool is_same_dim = ins[i]->numel() == numel;
if (is_same_dim) {
use_broadcast[i] = false;
auto temp_size = phi::GetVectorizedSize<InT>(in_data);
in_vec_size = std::min(temp_size, in_vec_size);
in_addr = (in_addr | reinterpret_cast<uint64_t>(in_data));
} else {
use_broadcast[i] = true;
broadcast_num++;
}
all_elementwise &= is_same_dim;
}
int in_vec_size = std::min(
4, phi::GetVectorizedSize<InT>(reinterpret_cast<InT *>(in_addr)));
vec_size = std::min(out_vec_size, in_vec_size);
}
private:
int in_vec_size{4};
};
#ifndef PADDLE_WITH_XPU_KP
......
......@@ -567,13 +567,15 @@ int GetVectorizedSizeForTensors(const std::vector<const DenseTensor *> &ins,
using ArgsT = typename Traits::ArgsTuple;
const int Arity = Traits::arity;
int vec_size = 4;
uint64_t addr = static_cast<uint64_t>(0);
ArgsT arg;
// The Arg VecSize=1 is to match the Unroller template.
Unroller<VecSizeGetter, 1, Arity>::step(ins, arg, &vec_size);
for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
vec_size =
std::min<int>(vec_size, phi::GetVectorizedSize((*iter)->data<OutT>()));
addr = (addr | reinterpret_cast<uint64_t>((*iter)->data<OutT>()));
}
vec_size = std::min(
vec_size, phi::GetVectorizedSize<OutT>(reinterpret_cast<OutT *>(addr)));
#endif
return vec_size;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册