diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 2a8f47462345188c3870ca07119fe7687a1ebe9f..c65fa6340708a750247d561c046d4a4fccbd1bca 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -65,8 +65,9 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    int embedding_dim = table_grad_t->dims()[1];
+    /* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
+    int embedding_dim = table_grad_t->dims()[1];
     if (embedding_dim % 32 == 0) {
       // NOTE(pangyoki): The embedding_dim of Tensor used in
       // EmbeddingDenseGrad must be an integer multiple of 32.
@@ -77,19 +78,21 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
           {"padding_idx", -1}, {"scale_grad_by_freq", false}});
       runner.Run(stream);
-    } else {
-      const auto &runner_zeros =
-          NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
-      runner_zeros.Run(stream);
-
-      // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-      // can be different tensor, but in cann 20.2+, it does inplace operation.
-      // Thus, the first input and output should be same tensor.
-      const auto &runner_scatter =
-          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                      {*table_grad_t}, {{"use_locking", true}});
-      runner_scatter.Run(stream);
+      return;
     }
+    */
+
+    const auto &runner_zeros =
+        NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
+    runner_zeros.Run(stream);
+
+    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+    // can be different tensor, but in cann 20.2+, it does inplace operation.
+    // Thus, the first input and output should be same tensor.
+    const auto &runner_scatter =
+        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                    {*table_grad_t}, {{"use_locking", true}});
+    runner_scatter.Run(stream);
   }
 };
 
 }  // namespace operators
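
For reference, below is a minimal CPU-side sketch of what the retained ZerosLike + ScatterAdd path computes for the embedding gradient: zero the whole weight-gradient table, then accumulate each row of the output gradient into the table row selected by its id. This is not Paddle code; the function name, argument layout, and row-major indexing are illustrative assumptions only.

```cpp
#include <cstdint>
#include <vector>

// Reference-only sketch (hypothetical helper, not part of Paddle):
// table_grad is [num_weights, embedding_dim] row-major,
// out_grad is [num_ids, embedding_dim] row-major.
void EmbeddingGradReference(const std::vector<int64_t> &ids,
                            const std::vector<float> &out_grad,
                            int64_t num_weights, int64_t embedding_dim,
                            std::vector<float> *table_grad) {
  // Step 1: ZerosLike -- start from an all-zero weight gradient.
  table_grad->assign(static_cast<size_t>(num_weights * embedding_dim), 0.0f);

  // Step 2: ScatterAdd -- add each out_grad row into the table_grad row
  // picked by the corresponding id. Duplicate ids accumulate, which is why
  // the NPU runner passes {"use_locking", true}.
  for (size_t i = 0; i < ids.size(); ++i) {
    const int64_t row = ids[i];
    for (int64_t d = 0; d < embedding_dim; ++d) {
      (*table_grad)[row * embedding_dim + d] +=
          out_grad[i * embedding_dim + d];
    }
  }
}
```

Note that, unlike the commented-out EmbeddingDenseGrad path, this fallback has no padding_idx handling and no embedding_dim % 32 restriction, which is what allows it to serve as the unconditional path while the large-shape bug is worked around.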