From d746197398c9e1b298b1278b777c799e469e42b6 Mon Sep 17 00:00:00 2001
From: xiayanming <41795079@qq.com>
Date: Fri, 12 Mar 2021 14:15:42 +0800
Subject: [PATCH] [NPU] Support npu kernel for gather op fix bug (#31541)

* add gather npu op

* code review done

* update python new line

* precommit

* fix review

* del commit

* update gather_grad

* fix bug

* fix bug
---
 paddle/fluid/operators/gather_op_npu.cc      |  7 +++++--
 paddle/fluid/operators/gather_op_npu_test.cc | 10 +++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc
index 2d7b5b93ad..cf0d9cda34 100644
--- a/paddle/fluid/operators/gather_op_npu.cc
+++ b/paddle/fluid/operators/gather_op_npu.cc
@@ -51,10 +51,13 @@ class GatherGradOpNPUKernel : public framework::OpKernel<T> {
     auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
 
     // step1: Unsqueeze index
+    framework::Tensor tmp_tensor(index->type());
     const auto index_dims = index->dims();
     if (index_dims.size() == 1) {
-      framework::Tensor tmp_index = UnsqueezeTo(*index, 2);
-      index = &tmp_index;
+      tmp_tensor.ShareDataWith(*index);
+      std::vector<int64_t> new_dim = {index_dims[0], 1};
+      tmp_tensor.Resize(framework::make_ddim(new_dim));
+      index = &tmp_tensor;
     }
 
     auto stream =
diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc
index 4cd46da6f2..de067e4558 100644
--- a/paddle/fluid/operators/gather_op_npu_test.cc
+++ b/paddle/fluid/operators/gather_op_npu_test.cc
@@ -109,17 +109,17 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   auto dout = scope->Var("DOut");
   auto tensor_dout = dout->GetMutable<f::LoDTensor>();
 
-  std::vector<int> init_index = {0, 1, 2, 0};
+  std::vector<int> init_index = {0, 1};
   paddle::framework::TensorFromVector<int>(init_index, ctx, tensor_index);
-  tensor_index->Resize(paddle::framework::make_ddim({2, 2}));
+  tensor_index->Resize(paddle::framework::make_ddim({2}));
 
   std::vector<T> init_x = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   TensorFromVector(init_x, ctx, tensor_x);
   tensor_x->Resize(paddle::framework::make_ddim({3, 2}));
 
-  std::vector<T> init_dout = {5.0, 10.0};
+  std::vector<T> init_dout = {5.0, 10.0, 2.0, 3.0};
   TensorFromVector(init_dout, ctx, tensor_dout);
-  tensor_dout->Resize(paddle::framework::make_ddim({2}));
+  tensor_dout->Resize(paddle::framework::make_ddim({2, 2}));
 
   ctx.Wait();
 
@@ -143,7 +143,7 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   uint32_t expected_size = 3 * 2;
   EXPECT_EQ((uint32_t)dx_vec.size(), expected_size);
 
-  std::vector<T> expected_dx_vec = {0.0, 5.0, 0.0, 0.0, 10.0, 0.0};
+  std::vector<T> expected_dx_vec = {5.0, 10.0, 2.0, 3.0, 0.0, 0.0};
   for (uint32_t i = 0; i < dx_vec.size(); i++) {
     VLOG(3) << "dx_vec[i]=" << dx_vec[i];
     EXPECT_EQ(dx_vec[i], expected_dx_vec[i]);
-- 
GitLab