[NPU] change Add to AddN in sum npu op (#33957)

* change Add to AddN in sum npu op
* add AddInputNames
* change fp16 to fp32 because numpy has accuracy loss in fp16 adding
* delete check
* fix runner error

[NPU] change Add to AddN in sum npu op (#33957)
* change Add to AddN in sum npu op
* add AddInputNames
* change fp16 to fp32 because numpy has accuracy loss in fp16 adding
* delete check
* fix runner error
fa5ddfd9 · pangyoki · GitHub · a84e48b9 · fa5ddfd9 · fa5ddfd9
Showing 2 changed files with 21 additions and 9 deletions

paddle/fluid/operators/sum_op_npu.cc paddle/fluid/operators/sum_op_npu.cc +13 -8

python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py +8 -1

未找到文件。
--- a/paddle/fluid/operators/sum_op_npu.cc
+++ b/paddle/fluid/operators/sum_op_npu.cc
@@ -35,23 +35,28 @@ class SumNPUKernel : public framework::OpKernel<T> {
    auto place = ctx.GetPlace();

    int n = static_cast<int>(x.size());
-
    if (n == 1) {
      TensorCopy(*x[0], place, out);
      return;
    }

+    std::vector<framework::Tensor> inputs;
+    std::vector<std::string> names;
+    for (int i = 0; i < n; ++i) {
+      if (x[i] && x[i]->numel() > 0) {
+        inputs.push_back(*x[i]);
+        names.push_back("x" + std::to_string(i));
+      } else {
+        continue;
+      }
+    }
+
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
-
-    const auto& runner = NpuOpRunner("Add", {*x[0], *x[1]}, {*out}, {});
-
+    NpuOpRunner runner{"AddN", {inputs}, {*out}, {{"N", n}}};
+    runner.AddInputNames(names);
    runner.Run(stream);
-    for (int i = 2; i < n; i++) {
-      const auto& runner1 = NpuOpRunner("Add", {*out, *x[i]}, {*out}, {});
-      runner1.Run(stream);
-    }
  }
 };


--- a/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_sum_op_npu.py
@@ -67,7 +67,14 @@ class TestSum2(OpTest):
        x2 = np.random.random((3, 3)).astype(self.dtype)
        x3 = np.random.random((3, 3)).astype(self.dtype)
        self.inputs = {'X': [("x0", x0), ("x1", x1), ("x2", x2), ("x3", x3)]}
-        y = x0 + x1 + x2 + x3
+        # Computing the reference result directly as `y = x0 + x1 + x2 + x3`
+        # is unreliable: numpy's fp16 `add` loses precision, so the result
+        # depends on the order of summation. For example, `x0+x1+x2+x3`
+        # can differ from `x3+x2+x1+x0` when the dtype is fp16.
+        # Therefore, the inputs are converted to fp32 for the calculation
+        # and the sum is cast back to the original dtype.
+        y = (x0.astype(np.float32) + x1.astype(np.float32) +
+             x2.astype(np.float32) + x3.astype(np.float32)).astype(self.dtype)
        self.outputs = {'Out': y}

        self.attrs = {'use_mkldnn': False}