diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 3cb91c712335d6502ae637cc6b281606232ae368..a6fd7e5c7a97d36f49b41fc07519e0a549c81bce 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -101,6 +101,7 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
+    int64_t padding_idx = ctx.Attr<int64_t>("padding_idx");
 
     /* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
 
@@ -123,13 +124,34 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
     runner_zeros.Run(stream);
 
-    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-    // can be different tensor, but in cann 20.2+, it does inplace operation.
-    // Thus, the first input and output should be same tensor.
-    const auto &runner_scatter =
-        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                    {*table_grad_t}, {{"use_locking", true}});
-    runner_scatter.Run(stream);
+    if (padding_idx == kNoPadding) {
+      // NOTE(zhiqiu): It seems that in cann 20.1 the first input and the
+      // output can be different tensors, but in cann 20.2+ the op runs
+      // in place, so the first input and the output must be the same tensor.
+      const auto &runner_scatter =
+          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                      {*table_grad_t}, {{"use_locking", true}});
+      runner_scatter.Run(stream);
+    } else {
+      Tensor casted_ids_t;
+      if (ids_t->type() != framework::proto::VarType::INT32) {
+        casted_ids_t.mutable_data<int32_t>(ids_t->dims(), ctx.GetPlace());
+        const auto &cast_runner = NpuOpRunner("Cast", {*ids_t}, {casted_ids_t},
+                                              {{"dst_type", ACL_INT32}});
+        cast_runner.Run(stream);
+      } else {
+        casted_ids_t.ShareDataWith(*ids_t);
+      }
+      auto table_grad_dims = table_grad_t->dims();
+
+      NpuOpRunner runner;
+      runner.SetType("UnsortedSegmentSum")
+          .AddInput(*output_grad_t)
+          .AddInput(casted_ids_t)
+          .AddInput(std::vector<int64_t>{table_grad_dims[0]})
+          .AddOutput(*table_grad_t);
+      runner.Run(stream);
+    }
   }
 };
 }  // namespace operators
diff --git a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
index 1031be4c1a7b4169d81b1f2363b96b716f8d109a..fefff0974ae40d2b9ac9d1a5f81410283cef0761 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_lookup_table_v2_op_npu.py
@@ -38,7 +38,7 @@ class TestLookupTableV2(OpTest):
         np.random.seed(SEED)
         w = np.random.random([self.vocab, self.dim]).astype(self.dtype)
         x = np.random.randint(
-            0, self.vocab, size=(self.bsz, self.seqlen)).astype(np.int32)
+            0, self.vocab, size=(self.bsz, self.seqlen)).astype(self.ids_dtype)
         out = w[x]
         if self.padding_idx != -1:
             out[np.squeeze(x == self.padding_idx)] = np.zeros(self.dim)
@@ -60,6 +60,7 @@ class TestLookupTableV2(OpTest):
 
     def init_dtype(self):
         self.dtype = np.float32
+        self.ids_dtype = np.int32
 
     def init_dims(self):
         self.bsz = 6
@@ -85,6 +86,7 @@ class TestLookupTableV2FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int32
 
     def set_npu(self):
         self.__class__.use_npu = True
@@ -105,6 +107,7 @@ class TestLookupTableV2Dim32FP16(TestLookupTableV2):
 
     def init_dtype(self):
         self.dtype = np.float16
+        self.ids_dtype = np.int64
 
     def init_dims(self):
         self.bsz = 6
@@ -122,5 +125,14 @@ class TestLookupTableV2WithPadding(TestLookupTableV2):
         self.padding_idx = np.random.randint(0, self.vocab)
 
 
+class TestLookupTableV2WithPadding1(TestLookupTableV2):
+    def init_padding_idx(self):
+        self.padding_idx = np.random.randint(0, self.vocab)
+
+    def init_dtype(self):
+        self.dtype = np.float32
+        self.ids_dtype = np.int64
+
+
 if __name__ == '__main__':
     unittest.main()
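
Reviewer note (not part of the patch): on the new padding_idx branch, W@GRAD is
computed as an unsorted segment sum, i.e. each row of Out@GRAD is accumulated
into the W@GRAD row selected by the corresponding id; the Cast presumably exists
because the CANN UnsortedSegmentSum operator expects int32 segment ids. A minimal
NumPy sketch of that reduction (illustrative only; the helper name
lookup_table_v2_grad_ref is made up, not part of Paddle):

    import numpy as np

    def lookup_table_v2_grad_ref(ids, out_grad, vocab):
        # Flatten so each id selects one gradient row of shape (dim,).
        flat_ids = ids.reshape(-1).astype(np.int32)
        flat_grad = out_grad.reshape(-1, out_grad.shape[-1])
        # Unsorted segment sum: duplicate ids accumulate rather than overwrite,
        # which is the same reduction ScatterAdd performed on the old path.
        w_grad = np.zeros((vocab, flat_grad.shape[-1]), dtype=flat_grad.dtype)
        np.add.at(w_grad, flat_ids, flat_grad)
        return w_grad

    # Example: id 2 appears twice, so its row accumulates two gradient rows.
    ids = np.array([[0, 2, 2]])
    g = np.ones((1, 3, 4), dtype=np.float32)
    print(lookup_table_v2_grad_ref(ids, g, vocab=5))

The segment-sum formulation produces the same accumulation as ScatterAdd while
avoiding the in-place ScatterAdd call on the padding_idx path, which appears to
be the motivation for the change.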