【NPU】Add TensorCopy to NPU kernel for reduce_sum op (#31667)

* update unittest
* add TensorCopy in npu grad kernel

【NPU】Add TensorCopy to NPU kernel for reduce_sum op (#31667)
* update unittest
* add TensorCopy in npu grad kernel
444c2852 · pangyoki · GitHub · 8f08f160 · 444c2852 · 444c2852
2 changed files
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
@@ -83,6 +83,11 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
      Tensor out_grad_tmp(out_grad->type());
      out_grad_tmp.Resize(out_dims);
      out_grad_tmp.mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopy(
+          *out_grad, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(),
+          &out_grad_tmp);
+      out_grad_tmp.Resize(out_dims);

      auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad},
                                {{"shape", framework::vectorize(x->dims())}});

--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
@@ -102,7 +102,9 @@ class TestReduceSumNet(unittest.TestCase):
            label = paddle.static.data(
                name="label", shape=[2, 1], dtype='int64')

-            z = paddle.add(a, b)
+            a_1 = fluid.layers.fc(input=a, size=4, num_flatten_dims=2, act=None)
+            b_1 = fluid.layers.fc(input=b, size=4, num_flatten_dims=2, act=None)
+            z = paddle.add(a_1, b_1)
            z_1 = self.set_reduce_sum_function(z)

            prediction = fluid.layers.fc(input=z_1, size=2, act='softmax')