From 02346930408c15037823c6eb3dc74bb4de8a0fed Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Sat, 13 Mar 2021 22:20:26 +0800
Subject: [PATCH] fix gather_grad bug (#31607)

---
 paddle/fluid/operators/gather_op_npu.cc       | 14 +++--
 .../tests/unittests/npu/test_gather_op_npu.py | 54 +++++++++++++++++++
 2 files changed, 60 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index cf0d9cda34..1d742b4076 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/kron_op.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/platform/npu_info.h"
 
 namespace paddle {
 namespace operators {
@@ -65,20 +66,17 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
           .stream();
 
     // step2: ZerosLike x in device
-    Tensor *tmp_zerox = const_cast<Tensor *>(x);
     Tensor zeroslike_xout(x->type());
     zeroslike_xout.Resize(x->dims());
-    zeroslike_xout.mutable_data<T>(ctx.GetPlace());
+    auto p = zeroslike_xout.mutable_data<T>(ctx.GetPlace());
 
-    auto runner_zeroslike =
-        NpuOpRunner("ZerosLike", {*x}, {zeroslike_xout}, {});
-    runner_zeroslike.Run(stream);
-    tmp_zerox = &zeroslike_xout;
+    platform::NPUMemsetAsync(static_cast<void *>(p), 0,
+                             zeroslike_xout.numel() * sizeof(T), stream);
 
     // step3: scatter(x_grad)
     dx->mutable_data<T>(ctx.GetPlace());
-    auto runner_scatter = NpuOpRunner("TensorScatterUpdate",
-                                      {*tmp_zerox, *index, *dout}, {*dx}, {});
+    auto runner_scatter = NpuOpRunner(
+        "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {});
     runner_scatter.Run(stream);
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
index 0fcb2bee65..8230be1c40 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_gather_op_npu.py
@@ -23,6 +23,7 @@ import paddle
 import paddle.fluid as fluid
 
 paddle.enable_static()
+SEED = 2021
 
 
 @unittest.skipIf(not paddle.is_compiled_with_npu(),
@@ -105,5 +106,58 @@ class TestGatherAPI(unittest.TestCase):
         pass
 
 
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestGatherGrad(unittest.TestCase):
+    def _test(self, run_npu=True):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+
+        a_np = np.random.random(size=(8192, 768)).astype('float32')
+        index_np = np.random.randint(0, 8192, size=(1232, 1)).astype('int32')
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            a = paddle.static.data(name="a", shape=[8192, 768], dtype='float32')
+            index = paddle.static.data(
+                name="index", shape=[1232, 1], dtype='int32')
+            a.stop_gradient = False
+            b = paddle.gather(a, index)
+
+            loss = fluid.layers.reduce_mean(b)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd.minimize(loss)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_prog)
+
+        print("Start run on {}".format(place))
+        for epoch in range(100):
+
+            pred_res, loss_res = exe.run(main_prog,
+                                         feed={"a": a_np,
+                                               "index": index_np},
+                                         fetch_list=[b, loss])
+            if epoch % 10 == 0:
+                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                    epoch, pred_res[0], loss_res[0]))
+
+        return pred_res, loss_res
+
+    def test_npu(self):
+        npu_pred, npu_loss = self._test(True)
+        cpu_pred, cpu_loss = self._test(False)
+
+        self.assertTrue(np.allclose(npu_pred, cpu_pred))
+        self.assertTrue(np.allclose(npu_loss, cpu_loss))
+
+
 if __name__ == '__main__':
     unittest.main()
-- 
GitLab
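
For context, the invariant the new test case exercises can also be checked directly in dygraph mode: every row of x that index never selects must receive an exactly-zero gradient, which is why the kernel above now zero-fills the scratch tensor with NPUMemsetAsync before running TensorScatterUpdate. Below is a minimal sketch, not part of the patch: the shapes are arbitrary, and it runs on the default place (CPU unless Paddle was built with NPU support).

    # Illustrative sketch only. Checks that rows of x not selected by
    # `index` come back from gather's backward pass with zero gradient.
    import numpy as np
    import paddle

    paddle.disable_static()  # dygraph mode

    x = paddle.to_tensor(
        np.random.random((8, 4)).astype('float32'), stop_gradient=False)
    index = paddle.to_tensor(np.array([1, 3]).astype('int32'))

    out = paddle.gather(x, index)  # picks rows 1 and 3 of x
    loss = paddle.mean(out)
    loss.backward()

    # Rows 1 and 3 carry the mean's gradient; all other rows must be zero.
    # The patch replaces a ZerosLike op with an explicit NPUMemsetAsync
    # zero-fill so the scatter target is guaranteed to start from zeros.
    grad = np.array(x.grad)
    mask = np.ones(8, dtype=bool)
    mask[[1, 3]] = False
    assert np.allclose(grad[mask], 0.0)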