diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
index 505e48054e75689e95b03e9ceb82de6fdd9a529d..e2cd7ca353ccfd64edd68c83208aa0ee2e459d24 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op_npu.cc
@@ -83,6 +83,11 @@ class ReduceSumGradNPUKernel : public framework::OpKernel<T> {
       Tensor out_grad_tmp(out_grad->type());
       out_grad_tmp.Resize(out_dims);
       out_grad_tmp.mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopy(
+          *out_grad, ctx.GetPlace(),
+          ctx.template device_context<platform::DeviceContext>(),
+          &out_grad_tmp);
+      out_grad_tmp.Resize(out_dims);
 
       auto runner = NpuOpRunner("BroadcastToD", {out_grad_tmp}, {*x_grad},
                                 {{"shape", framework::vectorize(x->dims())}});
diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
index caae0507c2645dda6faaeab52a07d7134721cb98..dea5141a024c84655d672eb44c2a2c4e9b9c0501 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_sum_op_npu.py
@@ -102,7 +102,9 @@ class TestReduceSumNet(unittest.TestCase):
             label = paddle.static.data(
                 name="label", shape=[2, 1], dtype='int64')
 
-            z = paddle.add(a, b)
+            a_1 = fluid.layers.fc(input=a, size=4, num_flatten_dims=2, act=None)
+            b_1 = fluid.layers.fc(input=b, size=4, num_flatten_dims=2, act=None)
+            z = paddle.add(a_1, b_1)
             z_1 = self.set_reduce_sum_function(z)
 
             prediction = fluid.layers.fc(input=z_1, size=2, act='softmax')