diff --git a/paddle/fluid/operators/norm_op.cc b/paddle/fluid/operators/norm_op.cc
index 1fc51e76e2540519e203a36874108bef02ab66c9..5880141520fa130454858f3f36661c969bdb4502 100644
--- a/paddle/fluid/operators/norm_op.cc
+++ b/paddle/fluid/operators/norm_op.cc
@@ -88,7 +88,11 @@ class NormOpGradOpMaker : public framework::SingleGradOpMaker<T> {
     op->SetAttrMap(this->Attrs());
     op->SetInput("X", this->Input("X"));
     op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
+#ifndef PADDLE_WITH_ASCEND_CL
     op->SetInput("Norm", this->Output("Norm"));
+#else
+    op->SetInput("Out", this->Output("Out"));
+#endif
     op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
   }
 };
diff --git a/paddle/fluid/operators/norm_op_npu.cc b/paddle/fluid/operators/norm_op_npu.cc
index 17b0fca2bb0971ee6229af721c89fc98ada758ac..ca2eac06c724720a465ba23c5f5deab77a7da553 100644
--- a/paddle/fluid/operators/norm_op_npu.cc
+++ b/paddle/fluid/operators/norm_op_npu.cc
@@ -15,24 +15,26 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+using DDim = framework::DDim;
+using Tensor = framework::Tensor;
+
+void CheckAxis(int axis, int rank) {
+  // check the axis is in [-rank, rank-1]
+  if (axis <= rank - 1 && axis >= -rank) return;
+  PADDLE_THROW(platform::errors::InvalidArgument(
+      "axis in norm operator must between (%d) and (%d)"
+      "but got (%d).",
+      -rank, rank - 1, axis));
+}
+
 template <typename DeviceContext, typename T>
 class NormNPUKernel : public framework::OpKernel<T> {
- private:
-  void CheckAxis(int axis, int rank) const {
-    // check the axis is in [-rank, rank-1]
-    if (axis <= rank - 1 && axis >= -rank) return;
-    PADDLE_THROW(platform::errors::InvalidArgument(
-        "axis in norm operator must between (%d) and (%d)"
-        "but got (%d).",
-        -rank, rank - 1, axis));
-  }
-
  public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
     VLOG(4) << "Launch Norm Op Kernel on NPU." << std::endl;
-    auto* in_x = ctx.Input<framework::Tensor>("X");
-    auto* out_y = ctx.Output<framework::Tensor>("Out");
-    auto* out_norm = ctx.Output<framework::Tensor>("Norm");
+    auto *in_x = ctx.Input<framework::Tensor>("X");
+    auto *out_y = ctx.Output<framework::Tensor>("Out");
+    auto *out_norm = ctx.Output<framework::Tensor>("Norm");
     out_y->mutable_data<T>(ctx.GetPlace());
     out_norm->mutable_data<T>(ctx.GetPlace());
     auto xdim = in_x->dims();
@@ -46,7 +48,7 @@ class NormNPUKernel : public framework::OpKernel<T> {
     attr_input_norm["p"] = 2;
     attr_input_norm["keepdim"] = true;
     attr_input_norm["epsilon"] = eps;
-    const auto& runner =
+    const auto &runner =
         NpuOpRunner("LpNorm", {*in_x}, {*out_norm}, attr_input_norm);
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
@@ -56,12 +58,48 @@ class NormNPUKernel : public framework::OpKernel<T> {
   }
 };
 
+template <typename DeviceContext, typename T>
+class NormGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    float epsilon = ctx.Attr<float>("epsilon");
+    int axis = ctx.Attr<int>("axis");
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *y = ctx.Input<Tensor>("Out");
+    auto *dy = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto xdim = x->dims();
+    CheckAxis(axis, xdim.size());
+
+    auto place = ctx.GetPlace();
+
+    dx->mutable_data<T>(place);
+
+    framework::NPUAttributeMap attr_input_norm;
+    attr_input_norm["dim"] = std::vector<int>({axis});
+    attr_input_norm["eps"] = epsilon;
+    const auto &runner =
+        NpuOpRunner("L2NormalizeGrad", {*x, *y, *dy}, {*dx}, attr_input_norm);
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
+
 REGISTER_OP_NPU_KERNEL(
     norm, ops::NormNPUKernel<paddle::platform::NPUDeviceContext, float>,
     ops::NormNPUKernel<paddle::platform::NPUDeviceContext,
                        paddle::platform::float16>)
+
+REGISTER_OP_NPU_KERNEL(
+    norm_grad, ops::NormGradNPUKernel<plat::NPUDeviceContext, float>,
+    ops::NormGradNPUKernel<plat::NPUDeviceContext, plat::float16>);
diff --git a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
index 2c946bb893127a75b819353ede8136e8ab6c3c0f..2c41f09ff51488dd8e6eff48fa0dec0a6917bf50 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_norm_op_npu.py
@@ -20,26 +20,18 @@
 import unittest
 import numpy as np
 import paddle
 import paddle.fluid as fluid
-from op_test import OpTest, skip_check_grad_ci
+from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci
+from paddle.fluid.tests.unittests.test_norm_op import l2_norm
 
-SEED = 2021
-
-def l2_norm(x, axis, epsilon):
-    x2 = x**2
-    s = np.sum(x2, axis=axis, keepdims=True)
-    r = np.sqrt(s) + epsilon
-    y = x / np.broadcast_to(r, x.shape)
-    return y, r
-
-
-class TestNorm(OpTest):
+class TestNPUNormOp(OpTest):
     def setUp(self):
         paddle.enable_static()
         self.set_npu()
         self.place = paddle.NPUPlace(0)
         self.op_type = "norm"
         self.init_dtype()
+        self.init_test_case()
 
         x = np.random.random(self.shape).astype(self.dtype)
         y, norm = l2_norm(x, self.axis, self.epsilon)
@@ -52,6 +44,8 @@ class TestNorm(OpTest):
 
     def init_dtype(self):
         self.dtype = np.float32
+
+    def init_test_case(self):
         self.axis = 1
         self.epsilon = 1e-10
         self.shape = (2, 3, 4, 5)
@@ -59,29 +53,50 @@ class TestNorm(OpTest):
     def test_check_output(self):
         self.check_output_with_place(self.place)
 
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
 
-class TestNormOp2(TestNorm):
+        self.check_grad_with_place(
+            self.place, ['X'], 'Out', max_relative_error=0.006)
+
+
+class TestNPUNormOp2(TestNPUNormOp):
     def init_test_case(self):
         self.shape = [5, 3, 9, 7]
         self.axis = 0
         self.epsilon = 1e-8
-        self.dtype = np.float32
 
 
-class TestNormOp3(TestNorm):
+class TestNPUNormOp3(TestNPUNormOp):
     def init_test_case(self):
         self.shape = [5, 3, 2, 7]
         self.axis = -1
         self.epsilon = 1e-8
-        self.dtype = np.float32
 
 
-class TestNormOp4(TestNorm):
+@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
+                    "however it is desirable to cover the forward pass")
+class TestNPUNormOp4(TestNPUNormOp):
     def init_test_case(self):
         self.shape = [128, 1024, 14, 14]
         self.axis = 2
         self.epsilon = 1e-8
-        self.dtype = np.float32
+
+    def test_check_grad(self):
+        pass
+
+
+@skip_check_grad_ci(reason="'check_grad' on large inputs is too slow, " +
+                    "however it is desirable to cover the forward pass")
+class TestNPUNormOp5(TestNPUNormOp):
+    def init_test_case(self):
+        self.shape = [2048, 2048]
+        self.axis = 1
+        self.epsilon = 1e-8
+
+    def test_check_grad(self):
+        pass
 
 
 class API_NormTest(unittest.TestCase):
@@ -96,13 +111,15 @@ class API_NormTest(unittest.TestCase):
             self.assertRaises(TypeError, test_norm_x_type)
 
 
-class TestNormFP16(TestNorm):
+class TestNPUNormOpFP16(TestNPUNormOp):
     def set_npu(self):
         self.__class__.use_npu = True
         self.__class__.no_need_check_grad = True
 
     def init_dtype(self):
         self.dtype = np.float16
+
+    def init_test_case(self):
         self.axis = -1
         self.epsilon = 1e-10
         self.shape = (2, 3, 100)
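For reference, the Ascend L2NormalizeGrad operator wired up in norm_op_npu.cc consumes the forward output y (together with x and dy) rather than the saved norm, which is why the grad op maker feeds "Out" instead of "Norm" under PADDLE_WITH_ASCEND_CL. Below is a minimal NumPy sketch of the expected semantics, assuming the standard l2-normalize gradient and ignoring the epsilon term in the derivative; l2_norm mirrors the helper the tests import from test_norm_op, while l2_normalize_grad_ref is a hypothetical reference, not part of the patch.

import numpy as np


def l2_norm(x, axis, epsilon):
    # forward reference used by the tests: y = x / (||x||_2 + epsilon) along `axis`
    s = np.sum(x**2, axis=axis, keepdims=True)
    r = np.sqrt(s) + epsilon
    return x / np.broadcast_to(r, x.shape), r


def l2_normalize_grad_ref(x, dy, axis, epsilon):
    # hypothetical backward reference (epsilon dropped from the derivative):
    # dx = (dy - y * sum(dy * y, axis)) / (||x||_2 + epsilon); only x, y and dy are needed,
    # so the NPU grad kernel can work from "Out" without the saved "Norm" tensor
    y, r = l2_norm(x, axis, epsilon)
    return (dy - y * np.sum(dy * y, axis=axis, keepdims=True)) / r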