diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 3cb91c712335d6502ae637cc6b281606232ae368..a6fd7e5c7a97d36f49b41fc07519e0a549c81bce 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -101,6 +101,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
+    int64_t padding_idx = ctx.Attr<int64_t>("padding_idx");
 
     /* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
 
@@ -123,13 +124,34 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
     runner_zeros.Run(stream);
 
-    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-    // can be different tensor, but in cann 20.2+, it does inplace operation.
-    // Thus, the first input and output should be same tensor.
-    const auto &runner_scatter =
-        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                    {*table_grad_t}, {{"use_locking", true}});
-    runner_scatter.Run(stream);
+    if (padding_idx == kNoPadding) {
+      // NOTE(zhiqiu): It seems that in cann 20.1 the first input and the
+      // output can be different tensors, but in cann 20.2+ the op runs
+      // in place, so the first input and the output must be the same tensor.
+      const auto &runner_scatter =
+          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                      {*table_grad_t}, {{"use_locking", true}});
+      runner_scatter.Run(stream);
+    } else {
+      Tensor casted_ids_t;
+      if (ids_t->type() != framework::proto::VarType::INT32) {
+        casted_ids_t.mutable_data<int32_t>(ids_t->dims(), ctx.GetPlace());
+        const auto &cast_runner = NpuOpRunner("Cast", {*ids_t}, {casted_ids_t},
+                                              {{"dst_type", ACL_INT32}});
+        cast_runner.Run(stream);
+      } else {
+        casted_ids_t.ShareDataWith(*ids_t);
+      }
+      auto table_grad_dims = table_grad_t->dims();
+
+      NpuOpRunner runner;
+      runner.SetType("UnsortedSegmentSum")
+          .AddInput(*output_grad_t)
+          .AddInput(casted_ids_t)
+          .AddInput(std::vector<int64_t>{table_grad_dims[0]})
+          .AddOutput(*table_grad_t);
+      runner.Run(stream);
+    }
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 1031be4c1a7b4169d81b1f2363b96b716f8d109a..fefff0974ae40d2b9ac9d1a5f81410283cef0761 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -38,7 +38,7 @@ class TestLookupTableV2(OpTest):
         np.random.seed(SEED)
         w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
         x = np.random.randint(
-            0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32)
+            0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
         out = w[x]
         if self.padding_idx != -1:
             out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
@@ -60,6 +60,7 @@ class TestLookupTableV2(OpTest):
 
     def init_dtype(self):
         self.dtype = np.float32
+        self.ids_dtype = np.int32
 
     def init_dims(self):
         self.bsz = 6
@@ -85,6 +86,7 @@ class TestLookupTableV2FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int32
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -105,6 +107,7 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int64
 
     def init_dims(self):
         self.bsz = 6
@@ -122,5 +125,14 @@ class TestLookupTableV2WithPadding(TestLookupTableV2):
         self.padding_idx = np.random.randint(0, self.vocab)
 
 
+class TestLookupTableV2WithPadding1(TestLookupTableV2):
+    def init_padding_idx(self):
+        self.padding_idx = np.random.randint(0, self.vocab)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+        self.ids_dtype = np.int64
+
+
 if __name__ == '__main__':
     unittest.main()
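
Reviewer note (not part of the patch): on the new padding_idx branch, W@GRAD is
computed as an unsorted segment sum, i.e. each row of Out@GRAD is accumulated
into the W@GRAD row selected by the corresponding id; the Cast presumably exists
because the CANN UnsortedSegmentSum operator expects int32 segment ids. A minimal
NumPy sketch of that reduction (illustrative only; the helper name
lookup_table_v2_grad_ref is made up, not part of Paddle):

    import numpy as np

    def lookup_table_v2_grad_ref(ids, out_grad, vocab):
        # Flatten so each id selects one gradient row of shape (dim,).
        flat_ids = ids.reshape(-1).astype(np.int32)
        flat_grad = out_grad.reshape(-1, out_grad.shape[-1])
        # Unsorted segment sum: duplicate ids accumulate rather than overwrite,
        # which is the same reduction ScatterAdd performed on the old path.
        w_grad = np.zeros((vocab, flat_grad.shape[-1]), dtype=flat_grad.dtype)
        np.add.at(w_grad, flat_ids, flat_grad)
        return w_grad

    # Example: id 2 appears twice, so its row accumulates two gradient rows.
    ids = np.array([[0, 2, 2]])
    g = np.ones((1, 3, 4), dtype=np.float32)
    print(lookup_table_v2_grad_ref(ids, g, vocab=5))

The segment-sum formulation produces the same accumulation as ScatterAdd while
avoiding the in-place ScatterAdd call on the padding_idx path, which appears to
be the motivation for the change.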