Unverified commit 8497e2aa, authored by Leo Chen, committed by GitHub

[NPU] add npu kernel for elementwise_add_grad (#31347)

* fix reading flags from env

* fix problem caused by async run

* support partial grad

* support elementwise_add_grad npu kernel

* add unittest

* fix bug?
Parent 9fcdaeba
@@ -12,17 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

template <typename T>
class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -39,12 +40,127 @@ class ElementwiseAddNPUKernel : public framework::OpKernel<T> {
  }
};
template <typename T>
class ElementwiseAddGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));

    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    // NOTE(zhiqiu): It seems the Ascend Add op follows the broadcast semantics
    // with default axis=-1, so add_grad has to reduce dout when the inputs
    // were broadcast. For example, the shape of each variable in
    // elementwise_add:
    //   x, dx: [2, 3, 5]
    //   y, dy: [1, 5]
    //   out, dout: [2, 3, 5]
    // Then, out = x + y => dx = dout, dy = dout,
    // and the shape of dy is recovered by a two-stage reduce:
    //   1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    //   2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.

    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      // For dx
      // stage 1
      auto reduce_ndim = dout->dims().size() - dx->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      Tensor* tmp_dout = const_cast<Tensor*>(dout);
      Tensor reduced_dout(dx->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }

      // stage 2
      axes.clear();
      for (auto i = 0; i < dx->dims().size(); ++i) {
        if (dx->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
      } else {
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
      }
    }

    if (dy) {
      // For dy
      // stage 1
      auto reduce_ndim = dout->dims().size() - dy->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      Tensor* tmp_dout = const_cast<Tensor*>(dout);
      Tensor reduced_dout(dout->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
      }

      // stage 2
      axes.clear();
      for (auto i = 0; i < dy->dims().size(); ++i) {
        if (dy->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        dy->mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
      } else {
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .Wait();
        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy);
      }
    }
  }
};

} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel<float>,
                       ops::ElementwiseAddNPUKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(elementwise_add_grad,
                       ops::ElementwiseAddGradNPUKernel<float>,
                       ops::ElementwiseAddGradNPUKernel<plat::float16>);
#endif
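
The essence of the new grad kernel is the host-side axis bookkeeping around the two ReduceSumD launches. The sketch below is a minimal, self-contained illustration of that bookkeeping only (hypothetical helper names, not code from this patch): stage 1 collects the leading axes that dout has but the output gradient lacks (reduced with keep_dims = false), and stage 2 collects the axes where the output gradient keeps a size of 1 (reduced with keep_dims = true).

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical helper mirroring the kernel's host-side logic: given the dims
// of dout and of the gradient tensor to fill (dx or dy), compute the axes for
// the two ReduceSumD calls.
struct ReduceAxes {
  std::vector<int> stage1;  // leading axes dout has but the grad lacks
  std::vector<int> stage2;  // axes where the grad dim equals 1
};

ReduceAxes ComputeReduceAxes(const std::vector<int64_t>& dout_dims,
                             const std::vector<int64_t>& grad_dims) {
  ReduceAxes axes;
  int reduce_ndim = static_cast<int>(dout_dims.size() - grad_dims.size());
  for (int i = 0; i < reduce_ndim; ++i) {
    axes.stage1.push_back(i);
  }
  for (int i = 0; i < static_cast<int>(grad_dims.size()); ++i) {
    if (grad_dims[i] == 1) {
      axes.stage2.push_back(i);
    }
  }
  return axes;
}

int main() {
  // Shapes from the NOTE above: dout is [2, 3, 5], dy is [1, 5].
  ReduceAxes axes = ComputeReduceAxes({2, 3, 5}, {1, 5});
  std::cout << "stage 1 axes:";
  for (int a : axes.stage1) std::cout << ' ' << a;  // 0: [2, 3, 5] -> [3, 5]
  std::cout << "\nstage 2 axes:";
  for (int a : axes.stage2) std::cout << ' ' << a;  // 0: [3, 5] -> [1, 5]
  std::cout << '\n';
  return 0;
}

If a stage's axis list comes out empty, the kernel skips that ReduceSumD; when the stage-2 list is empty it instead copies the current tmp_dout into the gradient with TensorCopySync, which is the else branch above.
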
@@ -74,6 +74,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                                    {{"Out", {"Out"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  std::vector<T> out_vec;
  TensorToVector(*tensor_out, ctx, &out_vec);
@@ -125,57 +126,64 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
  // run
  f::AttributeMap attrs;
  auto op = f::OpRegistry::CreateOp(
      op_type, {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}, {"Y", {"Y"}}},
      {{"X@GRAD", {"DX"}}, {"Y@GRAD", {"DY"}}}, attrs);

  auto place = ctx.GetPlace();
  op->Run(*scope, place);
  ctx.Wait();

  std::vector<T> dx_vec;
  TensorToVector(*tensor_dx, ctx, &dx_vec);

  std::vector<T> dy_vec;
  TensorToVector(*tensor_dy, ctx, &dy_vec);

  ctx.Wait();
  float expected_x, expected_y;
  if (op_type == "elementwise_add_grad") {
    expected_x = 1.0;
    expected_y = 6.0;
  } else if (op_type == "elementwise_sub_grad") {
    expected_x = 1.0;
    expected_y = -6.0;
  }

  for (uint32_t i = 0; i < dx_vec.size(); i++) {
    EXPECT_EQ(dx_vec[i], static_cast<T>(expected_x));
  }
  for (uint32_t i = 0; i < dy_vec.size(); i++) {
    EXPECT_EQ(dy_vec[i], static_cast<T>(expected_y));
  }
}
TEST(elementwise_add, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_add");
}

TEST(elementwise_sub, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub, NPU_fp16) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<p::float16>(&scope, ctx, "elementwise_sub");
}

TEST(elementwise_sub_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
}

TEST(elementwise_add_grad, NPU) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
}
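
A quick check of the expected values in CompareGrad (the tensor setup is elided above, so assume, as in the kernel's NOTE, that DOut is filled with ones of shape [2, 3, 5] and DY has shape [1, 5], i.e. six dout elements fold into each dy entry). The kernel performs the reduction in two ReduceSumD stages; the sketch below folds them into a single host-side sum just to verify the arithmetic behind expected_y:

#include <iostream>
#include <vector>

int main() {
  // Illustrative shapes only: dout is [2, 3, 5] filled with 1.0, dy is [1, 5],
  // so each dy element accumulates 2 * 3 = 6 values of dout.
  const int d0 = 2, d1 = 3, d2 = 5;
  std::vector<float> dout(d0 * d1 * d2, 1.0f);
  std::vector<float> dy(1 * d2, 0.0f);

  // Sum dout over the broadcast dimensions (axes 0 and 1).
  for (int i = 0; i < d0; ++i) {
    for (int j = 0; j < d1; ++j) {
      for (int k = 0; k < d2; ++k) {
        dy[k] += dout[(i * d1 + j) * d2 + k];
      }
    }
  }

  std::cout << "add_grad dy[0] = " << dy[0] << '\n';   // 6, cf. expected_y = 6.0
  std::cout << "sub_grad dy[0] = " << -dy[0] << '\n';  // -6, cf. expected_y = -6.0
  return 0;
}

The sub_grad line simply negates the same reduction, mirroring the extra "Neg" stage in ElementwiseSubGradNPUKernel below; expected_x stays 1.0 for both ops, presumably because X is not broadcast in this test and dx is just dout.
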
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>
@@ -24,7 +23,7 @@ namespace operators {

using Tensor = framework::Tensor;

template <typename T>
class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -43,7 +42,7 @@ class ElementwiseSubNPUKernel : public framework::OpKernel<T> {
  }
};

template <typename T>
class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
@@ -51,8 +50,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();

    // NOTE(zhiqiu): It seems Ascend Sub follows the broadcast semantics with
    // default axis=-1?
@@ -66,89 +66,92 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
    // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false.
    // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true.

    if (dx) {
      dx->mutable_data<T>(ctx.GetPlace());
      // For dx
      // stage 1
      auto reduce_ndim = dout->dims().size() - dx->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      Tensor* tmp_dout = const_cast<Tensor*>(dout);
      Tensor reduced_dout(dx->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }

      // stage 2
      axes.clear();
      for (auto i = 0; i < dx->dims().size(); ++i) {
        if (dx->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
      } else {
        framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
      }
    }

    if (dy) {
      dy->mutable_data<T>(ctx.GetPlace());
      // For dy
      // stage 1
      auto reduce_ndim = dout->dims().size() - dy->dims().size();
      std::vector<int> axes;
      for (auto i = 0; i < reduce_ndim; ++i) {
        axes.push_back(i);
      }
      Tensor* tmp_dout = const_cast<Tensor*>(dout);
      Tensor reduced_dy(dy->type());
      Tensor reduced_dout(dy->type());
      if (axes.size() != 0) {
        std::vector<int64_t> reduced_dout_dims;
        for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
          reduced_dout_dims.push_back(dout->dims()[i]);
        }
        reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
        reduced_dout.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
                                  {{"axes", axes}, {"keep_dims", false}});
        runner.Run(stream);
        tmp_dout = &reduced_dout;
      }

      // stage 2
      axes.clear();
      Tensor* tmp_dy = tmp_dout;
      for (auto i = 0; i < dy->dims().size(); ++i) {
        if (dy->dims()[i] == 1) {
          axes.push_back(i);
        }
      }
      if (axes.size() != 0) {
        reduced_dy.Resize(dy->dims());
        reduced_dy.mutable_data<T>(ctx.GetPlace());
        auto runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {reduced_dy},
                                  {{"axes", axes}, {"keep_dims", true}});
        runner.Run(stream);
        tmp_dy = &reduced_dy;
      }
      // stage 3, negative
      auto runner = NpuOpRunner("Neg", {*tmp_dy}, {*dy}, {});
      runner.Run(stream);
    }
  }
};
@@ -156,16 +159,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel<T> {
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel<float>,
                       ops::ElementwiseSubNPUKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(elementwise_sub_grad,
                       ops::ElementwiseSubGradNPUKernel<float>,
                       ops::ElementwiseSubGradNPUKernel<plat::float16>);
#endif