From fa5ddfd9511a550010a373e632b876fcc36673b3 Mon Sep 17 00:00:00 2001
From: pangyoki
Date: Mon, 5 Jul 2021 17:17:57 +0800
Subject: [PATCH] [NPU] change Add to AddN in sum npu op (#33957)

* change Add to AddN in sum npu op

* add AddInputNames

* change fp16 to fp32 because numpy has accuracy loss in fp16 adding

* delete check

* fix runner error
---
 paddle/fluid/operators/sum_op_npu.cc            | 21 ++++++++++++-------
 .../tests/unittests/npu/test_sum_op_npu.py      |  9 +++++++-
 2 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc
index cbeb6285b6..a6032236c0 100644
--- a/paddle/fluid/operators/sum_op_npu.cc
+++ b/paddle/fluid/operators/sum_op_npu.cc
@@ -35,23 +35,28 @@ class SumNPUKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
 
     int n = static_cast<int>(x.size());
-
     if (n == 1) {
       TensorCopy(*x[0], place, out);
       return;
     }
 
+    std::vector<framework::Tensor> inputs;
+    std::vector<std::string> names;
+    for (int i = 0; i < n; ++i) {
+      if (x[i] && x[i]->numel() > 0) {
+        inputs.push_back(*x[i]);
+        names.push_back("x" + std::to_string(i));
+      } else {
+        continue;
+      }
+    }
+
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
-
-    const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
-
+    NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
+    runner.AddInputNames(names);
     runner.Run(stream);
-    for (int i = 2; i < n; i++) {
-      const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
-      runner1.Run(stream);
-    }
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
index 2ad6cc388f..21b42814c0 100755
--- a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
@@ -67,7 +67,14 @@ class TestSum2(OpTest):
         x2 = np.random.random((3, 3)).astype(self.dtype)
         x3 = np.random.random((3, 3)).astype(self.dtype)
         self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
-        y = x0 + x1 + x2 + x3
+        # There will be a problem if we just use `y = x0 + x1 + x2 + x3` to
+        # calculate the summation as the reference result, because numpy's
+        # fp16 data has precision loss in the `add` operation. For example,
+        # the result of `x0 + x1 + x2 + x3` differs from that of
+        # `x3 + x2 + x1 + x0` when the dtype is fp16.
+        # Therefore, convert the inputs to fp32 for the calculation.
+        y = (x0.astype(np.float32) + x1.astype(np.float32) +
+             x2.astype(np.float32) + x3.astype(np.float32)).astype(self.dtype)
         self.outputs = {'Out': y}
 
         self.attrs = {'use_mkldnn': False}
-- 
GitLab
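
The fp16 precision issue called out in the test comment is easy to reproduce with plain numpy, independent of Paddle. The following is a minimal sketch (variable names are illustrative, not from the patch) showing that fp16 summation is order-dependent, which is why the test now accumulates in fp32 and casts back:

    import numpy as np

    # fp16 addition rounds to half precision after every step, so the
    # accumulated sum depends on the order of the operands.
    np.random.seed(0)
    xs = [np.random.random((3, 3)).astype(np.float16) for _ in range(4)]

    forward = xs[0] + xs[1] + xs[2] + xs[3]   # x0 + x1 + x2 + x3
    backward = xs[3] + xs[2] + xs[1] + xs[0]  # x3 + x2 + x1 + x0

    # The two fp16 sums may differ in the last mantissa bit.
    print(np.array_equal(forward, backward))  # can print False

    # Accumulating in fp32 and casting back, as the updated test does,
    # gives an order-independent reference result.
    ref = (xs[0].astype(np.float32) + xs[1].astype(np.float32) +
           xs[2].astype(np.float32) + xs[3].astype(np.float32)).astype(np.float16)

Because the NPU's AddN kernel may accumulate in a different order (or at a different internal precision) than a left-to-right numpy fp16 loop, an fp16 reference computed as `x0 + x1 + x2 + x3` is not a stable ground truth; the fp32 accumulation sidesteps that.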