未验证 提交 91ba86b1 编写于 作者: R ronnywang 提交者: GitHub

[NPU] Fix the performance problem when 'axis' is not specified (#35116)

上级 763b6d91
...@@ -42,27 +42,22 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> { ...@@ -42,27 +42,22 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
auto y_dims = y->dims(); auto y_dims = y->dims();
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
if (x_dims.size() >= y_dims.size()) { if (x_dims.size() >= y_dims.size()) {
direct_compute = direct_compute = x_dims.size() == (y_dims.size() + axis);
y_dims == framework::slice_ddim(x_dims, axis, x_dims.size());
} else { } else {
direct_compute = direct_compute = y_dims.size() == (x_dims.size() + axis);
x_dims == framework::slice_ddim(y_dims, axis, y_dims.size());
} }
Tensor transformed_x, transformed_y;
if (direct_compute) { if (direct_compute) {
transformed_x.ShareDataWith(*x); const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {});
transformed_y.ShareDataWith(*y); runner.Run(dev_ctx.stream());
} else { } else {
Tensor transformed_x, transformed_y;
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &transformed_x, NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &transformed_x,
&transformed_y); &transformed_y);
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
runner.Run(dev_ctx.stream());
} }
const auto& runner =
NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {});
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
runner.Run(stream);
} }
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册