[NPU] disable EmbeddingDenseGrad temporarily (#34498)

2ad1e4c7 · Leo Chen · GitHub · 87148a5c · 2ad1e4c7
隐藏空白更改
内联并排

Showing 1 changed file with 16 additions and 13 deletions

paddle/fluid/operators/lookup_table_v2_op_npu.cc +16 -13

未找到文件。
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -65,8 +65,9 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
-    int embedding_dim = table_grad_t->dims()[1];
+    /* EmbeddingDenseGrad has bug on large shape, temporarily disable it.
+    int embedding_dim = table_grad_t->dims()[1];
    if (embedding_dim % 32 == 0) {
      // NOTE(pangyoki): The embedding_dim of Tensor used in
      // EmbeddingDenseGrad must be an integer multiple of 32.
@@ -77,19 +78,21 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel<T> {
                                        {"padding_idx", -1},
                                        {"scale_grad_by_freq", false}});
      runner.Run(stream);
-    } else {
+      return;
-      const auto &runner_zeros =
-          NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
-      runner_zeros.Run(stream);
-      // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-      // can be different tensor, but in cann 20.2+, it does inplace operation.
-      // Thus, the first input and output should be same tensor.
-      const auto &runner_scatter =
-          NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
-                      {*table_grad_t}, {{"use_locking", true}});
-      runner_scatter.Run(stream);
    }
+    */
+    const auto &runner_zeros =
+        NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t});
+    runner_zeros.Run(stream);
+    // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+    // can be different tensor, but in cann 20.2+, it does inplace operation.
+    // Thus, the first input and output should be same tensor.
+    const auto &runner_scatter =
+        NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t},
+                    {*table_grad_t}, {{"use_locking", true}});
+    runner_scatter.Run(stream);
  }
 };
 }  // namespace operators