Unverified commit 792d3d76 authored by Aganlengzi, committed by GitHub

[NPU] fix lookup_table_v2_grad ACL error for model BoW (#36864)

* [NPU] fix lookup_table_v2_grad ACL error for model BoW

* add more unit tests
Parent commit: 0a963ee9
......@@ -101,6 +101,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
auto stream =
ctx.template device_context<paddle::platform::NPUDeviceContext>()
.stream();
int64_t padding_idx = ctx.Attr<int64_t>("padding_idx");
/* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
......@@ -123,13 +124,34 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
runner_zeros.Run(stream);
// NOTE(zhiqiu): It seems in cann 20.1, the first input and output
// can be different tensor, but in cann 20.2+, it does inplace operation.
// Thus, the first input and output should be same tensor.
const auto &runner_scatter =
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
{*table_grad_t}, {{"use_locking", true}});
runner_scatter.Run(stream);
if (padding_idx == kNoPadding) {
// NOTE(zhiqiu): It seems in cann 20.1, the first input and output
// can be different tensor, but in cann 20.2+, it does inplace operation.
// Thus, the first input and output should be same tensor.
const auto &runner_scatter =
NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
{*table_grad_t}, {{"use_locking", true}});
runner_scatter.Run(stream);
} else {
Tensor casted_ids_t;
if (ids_t->type() != framework::proto::VarType::INT32) {
casted_ids_t.mutable_data<int32_t>(ids_t->dims(), ctx.GetPlace());
const auto &cast_runner = NpuOpRunner("Cast", {*ids_t}, {casted_ids_t},
{{"dst_type", ACL_INT32}});
cast_runner.Run(stream);
} else {
casted_ids_t.ShareDataWith(*ids_t);
}
auto table_grad_dims = table_grad_t->dims();
NpuOpRunner runner;
runner.SetType("UnsortedSegmentSum")
.AddInput(*output_grad_t)
.AddInput(casted_ids_t)
.AddInput(std::vector<int64_t>{table_grad_dims[0]})
.AddOutput(*table_grad_t);
runner.Run(stream);
}
}
};
} // namespace operators
......
......@@ -38,7 +38,7 @@ class TestLookupTableV2(OpTest):
np.random.seed(SEED)
w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
x = np.random.randint(
0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32)
0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
out = w[x]
if self.padding_idx != -1:
out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
......@@ -60,6 +60,7 @@ class TestLookupTableV2(OpTest):
def init_dtype(self):
    # Baseline configuration: float32 embedding table looked up with
    # int32 ids (the default dtype pairing for lookup_table_v2).
    self.ids_dtype = np.int32
    self.dtype = np.float32
def init_dims(self):
self.bsz = 6
......@@ -85,6 +86,7 @@ class TestLookupTableV2FP16(TestLookupTableV2):
def init_dtype(self):
    # FP16 variant: half-precision embedding table, ids stay int32.
    self.ids_dtype = np.int32
    self.dtype = np.float16
def set_npu(self):
self.__class__.use_npu = True
......@@ -105,6 +107,7 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2):
def init_dtype(self):
    # FP16 table combined with int64 ids — exercises the id-dtype
    # cast path in the NPU kernel.
    self.ids_dtype = np.int64
    self.dtype = np.float16
def init_dims(self):
self.bsz = 6
......@@ -122,5 +125,14 @@ class TestLookupTableV2WithPadding(TestLookupTableV2):
self.padding_idx = np.random.randint(0, self.vocab)
class TestLookupTableV2WithPadding1(TestLookupTableV2):
    """Padding test with int64 ids.

    With a valid padding_idx set, the NPU grad kernel takes the
    non-ScatterAdd branch (the UnsortedSegmentSum path added by this
    change), so this case covers that code path with 64-bit ids.
    """

    def init_padding_idx(self):
        # Choose a random in-range row so padding handling is exercised.
        self.padding_idx = np.random.randint(0, self.vocab)

    def init_dtype(self):
        self.ids_dtype = np.int64
        self.dtype = np.float32
# Entry point: run all lookup_table_v2 NPU unit tests in this module.
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
To comment, please register.