From fa5ddfd9511a550010a373e632b876fcc36673b3 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Mon, 5 Jul 2021 17:17:57 +0800
Subject: [PATCH] [NPU] change Add to AddN in sum npu op (#33957)

* change Add to AddN in sum npu op

* add AddInputNames

* change fp16 to fp32 because numpy has accuracy loss in fp16 adding

* delete check

* fix runner error
---
 paddle/fluid/operators/sum_op_npu.cc            | 21 ++++++++++++-------
 .../tests/unittests/npu/test_sum_op_npu.py      |  9 +++++++-
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc
index cbeb6285b6..a6032236c0 100644
--- a/paddle/fluid/operators/sum_op_npu.cc
+++ b/paddle/fluid/operators/sum_op_npu.cc
@@ -35,23 +35,28 @@ class SumNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
 
     int n = static_cast<int>(x.size());
-
     if (n == 1) {
       TensorCopy(*x[0], place, out);
       return;
     }
 
+    std::vector<framework::Tensor> inputs;
+    std::vector<std::string> names;
+    for (int i = 0; i < n; ++i) {
+      if (x[i] && x[i]->numel() > 0) {
+        inputs.push_back(*x[i]);
+        names.push_back("x" + std::to_string(i));
+      } else {
+        continue;
+      }
+    }
+
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
-
-    const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
-
+    NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
+    runner.AddInputNames(names);
     runner.Run(stream);
-    for (int i = 2; i < n; i++) {
-      const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
-      runner1.Run(stream);
-    }
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
index 2ad6cc388f..21b42814c0 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
@@ -67,7 +67,14 @@ class TestSum2(OpTest):
         x2 = np.random.random((3, 3)).astype(self.dtype)
         x3 = np.random.random((3, 3)).astype(self.dtype)
         self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
-        y = x0 + x1 + x2 + x3
+        # There will be a problem if we just use `y = x0 + x1 + x2 + x3` to
+        # calculate the summation as the reference result, because numpy's
+        # fp16 data has precision loss in the `add` operation. For example,
+        # the result of `x0 + x1 + x2 + x3` differs from that of
+        # `x3 + x2 + x1 + x0` when the dtype is fp16.
+        # Therefore, convert the inputs to fp32 for the calculation.
+        y = (x0.astype(np.float32) + x1.astype(np.float32) +
+             x2.astype(np.float32) + x3.astype(np.float32)).astype(self.dtype)
         self.outputs = {'Out': y}
 
         self.attrs = {'use_mkldnn': False}
-- 
GitLab
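
The fp16 precision issue called out in the test comment is easy to reproduce with plain numpy, independent of Paddle. The following is a minimal sketch (variable names are illustrative, not from the patch) showing that fp16 summation is order-dependent, which is why the test now accumulates in fp32 and casts back:

    import numpy as np

    # fp16 addition rounds to half precision after every step, so the
    # accumulated sum depends on the order of the operands.
    np.random.seed(0)
    xs = [np.random.random((3, 3)).astype(np.float16) for _ in range(4)]

    forward = xs[0] + xs[1] + xs[2] + xs[3]   # x0 + x1 + x2 + x3
    backward = xs[3] + xs[2] + xs[1] + xs[0]  # x3 + x2 + x1 + x0

    # The two fp16 sums may differ in the last mantissa bit.
    print(np.array_equal(forward, backward))  # can print False

    # Accumulating in fp32 and casting back, as the updated test does,
    # gives an order-independent reference result.
    ref = (xs[0].astype(np.float32) + xs[1].astype(np.float32) +
           xs[2].astype(np.float32) + xs[3].astype(np.float32)).astype(np.float16)

Because the NPU's AddN kernel may accumulate in a different order (or at a different internal precision) than a left-to-right numpy fp16 loop, an fp16 reference computed as `x0 + x1 + x2 + x3` is not a stable ground truth; the fp32 accumulation sidesteps that.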