Add cpu npu cembedding (#35467)

ec252914 · Baibaifan · GitHub · 4f4962cb · ec252914 · ec252914
8 changed file
--- a/paddle/fluid/operators/collective/c_embedding_op.cc
+++ b/paddle/fluid/operators/collective/c_embedding_op.cc
@@ -46,6 +46,17 @@ class CEmbeddingOp : public framework::OperatorWithKernel {
        framework::proto::VarType::LOD_TENSOR) {
      ctx->ShareLoD("Ids", /*->*/ "Out");
    }
+
+    // check valid
+    const int64_t height = table_dims[0];
+    const int64_t width = table_dims[1];
+    const int64_t start_idx = ctx->Attrs().Get<int64_t>("start_index");
+
+    PADDLE_ENFORCE_EQ(
+        (height > 0 && width > 0 && start_idx >= 0), true,
+        platform::errors::InvalidArgument(
+            "height:%ld width:%ld start_idx:%ld must not have negtive values",
+            height, width, start_idx));
  }

 protected:
@@ -63,7 +74,7 @@ class CEmbeddingOpMaker : public framework::OpProtoAndCheckerMaker {
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.");
    AddInput("Ids",
-             "An input with type int64 "
+             "An input with type int32 or int64 in CPU and GPU, int32 in NPU "
             "contains the ids to be looked up in W.");
    AddOutput("Out", "The lookup results, which have the same type as W.");

@@ -111,6 +122,21 @@ class CEmbeddingOpGrad : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    auto table_dims = ctx->GetInputDim("W");
    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
+
+    // check valid
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2,
+                      platform::errors::InvalidArgument(
+                          "Only accept the dims of table_t == 2"));
+
+    const int64_t start_idx = ctx->Attrs().Get<int64_t>("start_index");
+    const int64_t height = table_dims[0];
+    const int64_t width = table_dims[1];
+
+    PADDLE_ENFORCE_EQ(
+        (height > 0 && width > 0 && start_idx >= 0), true,
+        platform::errors::InvalidArgument(
+            "height:%ld width:%ld start_idx:%ld must not have negtive values",
+            height, width, start_idx));
  }

 protected:
@@ -137,6 +163,7 @@ class CEmbeddingOpGradVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle

 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 REGISTER_OPERATOR(c_embedding, ops::CEmbeddingOp, ops::CEmbeddingOpMaker,
                  ops::CEmbeddingGradOpMaker<paddle::framework::OpDesc>,
                  ops::CEmbeddingGradOpMaker<paddle::imperative::OpBase>);
@@ -146,4 +173,9 @@ REGISTER_OPERATOR(c_embedding_grad, ops::CEmbeddingOpGrad,
                  ops::CEmbeddingOpGradVarTypeInference);

 REGISTER_OP_CPU_KERNEL(c_embedding, ops::CEmbeddingOpCPUKernel<float>,
-                       ops::CEmbeddingOpCPUKernel<double>);
+                       ops::CEmbeddingOpCPUKernel<double>,
+                       ops::CEmbeddingOpCPUKernel<plat::float16>);
+
+REGISTER_OP_CPU_KERNEL(c_embedding_grad, ops::CEmbeddingGradOpCPUKernel<float>,
+                       ops::CEmbeddingGradOpCPUKernel<double>,
+                       ops::CEmbeddingGradOpCPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_embedding_op.cu
+++ b/paddle/fluid/operators/collective/c_embedding_op.cu
@@ -105,6 +105,9 @@ class CEmbeddingCUDAKernel : public framework::OpKernel<T> {
      CEmbedding<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
          output, table, ids_t->data<int64_t>(), K, D, N, start_idx, end_idx,
          limit);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "GPU c_embedding ids only support int32 or int64."));
    }
  }
 };

--- a/paddle/fluid/operators/collective/c_embedding_op.h
+++ b/paddle/fluid/operators/collective/c_embedding_op.h
@@ -27,12 +27,115 @@ namespace operators {

 using LoDTensor = framework::LoDTensor;

+inline void CheckTableValid() {}
+
+template <typename TIds, typename TData>
+void GetIdsEmbedding(const TIds* ids, size_t ids_len, int64_t start_idx,
+                     const TData* table, int64_t height, int64_t width,
+                     TData* out) {
+  for (size_t i = 0; i < ids_len; i++) {
+    TIds id = ids[i];
+    int64_t local = id - start_idx;
+
+    if (local >= 0 && local < height) {
+      // for (int64_t w = 0; w < width; w++) {
+      //   out[i * width + w] = table[local * width + w];
+      // }
+
+      memcpy(out + i * width, table + local * width, width * sizeof(TData));
+    } else {
+      memset(out + i * width, 0, width * sizeof(TData));
+    }
+  }
+}
+
 template <typename T>
 class CEmbeddingOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* table_t = ctx.Input<LoDTensor>("W");
+    auto* ids_t = ctx.Input<LoDTensor>("Ids");
+    auto* output_t = ctx.Output<LoDTensor>("Out");
+    const int64_t start_idx = ctx.Attr<int64_t>("start_index");
+
+    VLOG(10) << "table_dims:" << table_t->dims();
+
+    const T* table_data = table_t->data<T>();
+    T* output_data = output_t->mutable_data<T>(ctx.GetPlace());
+
+    const int64_t height = table_t->dims()[0];
+    const int64_t width = table_t->dims()[1];
+
+    const auto& index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      GetIdsEmbedding(ids_t->data<int32_t>(), ids_t->numel(), start_idx,
+                      table_data, height, width, output_data);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      GetIdsEmbedding(ids_t->data<int64_t>(), ids_t->numel(), start_idx,
+                      table_data, height, width, output_data);
+    } else {
      PADDLE_THROW(platform::errors::Unavailable(
-        "Do not support c_embedding for cpu kernel now."));
+          "CPU c_embedding ids only support int32 or int64."));
+    }
+  }
+};
+
+template <typename TIds, typename TData>
+void UpdateEmbedding(const TIds* ids, size_t ids_len, int64_t start_idx,
+                     TData* table, int64_t height, int64_t width,
+                     const TData* out) {
+  for (size_t i = 0; i < ids_len; i++) {
+    TIds id = ids[i];
+    int64_t local = id - start_idx;
+
+    if (local >= 0 && local < height) {
+      for (int64_t w = 0; w < width; w++) {
+        table[local * width + w] += out[i * width + w];
+      }
+    }
+  }
+}
+
+template <typename T>
+class CEmbeddingGradOpCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const int64_t start_idx = context.Attr<int64_t>("start_index");
+    auto ids_t = context.Input<LoDTensor>("Ids");
+    auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto table_t = context.Input<LoDTensor>("W");
+    auto table_grad_t = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+    T* table_grad_data =
+        table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
+
+    size_t table_t_mem_size =
+        table_t->numel() * framework::SizeOfType(table_grad_t->type());
+    size_t table_grad_t_mem_size =
+        table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());
+
+    VLOG(10) << "table_dims:" << table_t->dims()
+             << ", table_t memory_size:" << table_t_mem_size
+             << ", table_grad_t memory_size:" << table_grad_t_mem_size
+             << ", start_index:" << start_idx;
+
+    memset(table_grad_data, 0, table_grad_t_mem_size);
+    const T* d_output_data = d_output_t->data<T>();
+
+    const int64_t height = table_t->dims()[0];
+    const int64_t width = table_t->dims()[1];
+
+    const auto& index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      UpdateEmbedding(ids_t->data<int32_t>(), ids_t->numel(), start_idx,
+                      table_grad_data, height, width, d_output_data);
+    } else if (index_type == framework::proto::VarType::INT64) {
+      UpdateEmbedding(ids_t->data<int64_t>(), ids_t->numel(), start_idx,
+                      table_grad_data, height, width, d_output_data);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "CPU c_embedding ids only support int32 or int64."));
+    }
  }
 };


--- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/collective/c_embedding_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+#include "paddle/fluid/platform/npu_info.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+inline void FillNPU(Tensor *dst, T val,
+                    const framework::ExecutionContext &context) {
+  Tensor value(dst->type());
+  value.mutable_data<T>({1}, context.GetPlace());
+  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  const auto &runner = NpuOpRunner(
+      "FillD", {value}, {*dst}, {{"dims", framework::vectorize(dst->dims())}});
+  runner.Run(stream);
+}
+
+template <typename T>
+void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx,
+                 const Tensor &id_t,
+                 const framework::ExecutionContext &context) {
+  const int height = table_t.dims()[0];
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+  framework::Tensor id_t_d;
+  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&id_t_d, static_cast<T>(0.0), context);
+  id_t_d.Resize(ids_t.dims());
+
+  framework::Tensor id_t_u;
+  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
+  id_t_u.Resize(ids_t.dims());
+
+  framework::Tensor id_matched_d;
+  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
+  framework::Tensor id_matched_u;
+  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
+  framework::Tensor ignore_tensor;
+  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
+  FillNPU(&ignore_tensor, static_cast<T>(height), context);
+  ignore_tensor.Resize(ids_t.dims());
+
+  NpuOpRunner sub_runner;
+  sub_runner.SetType("Sub")
+      .AddInput(ids_t)
+      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
+      .AddOutput(id_t);
+  sub_runner.Run();
+
+  NpuOpRunner lessequal1_runner;
+  lessequal1_runner.SetType("LessEqual")
+      .AddInput(id_t)
+      .AddInput(id_t_u)
+      .AddOutput(id_matched_u);
+  lessequal1_runner.Run();
+
+  NpuOpRunner lessequal2_runner;
+  lessequal2_runner.SetType("LessEqual")
+      .AddInput(id_t_d)
+      .AddInput(id_t)
+      .AddOutput(id_matched_d);
+  lessequal2_runner.Run();
+
+  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
+      .Run(stream);
+  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
+      .Run(stream);
+}
+
+template <typename TIds, typename T>
+void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
+  auto *table_t = context.Input<LoDTensor>("W");
+  auto *ids_t = context.Input<LoDTensor>("Ids");
+  auto *output_t = context.Output<LoDTensor>("Out");
+  const int64_t start_idx = context.Attr<int64_t>("start_index");
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  framework::Tensor ids_t_local;
+  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
+  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
+
+  auto pad_shape =
+      framework::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
+  framework::LoDTensor table_t_pad;
+
+  size_t mem_size = table_t->numel() * framework::SizeOfType(table_t->type());
+  size_t line_mem_size =
+      table_t->dims()[1] * framework::SizeOfType(table_t->type());
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "NPU only accept the second dim must align by 64"));
+
+  VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
+           << ", pad_shape:" << pad_shape << ", table_dims:" << table_t->dims();
+
+  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
+      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      aclrtMemcpyAsync(pad_data, mem_size, table_t->data<T>(), mem_size,
+                       ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemsetAsync(
+      pad_data + mem_size, line_mem_size, 0, line_mem_size, stream));
+
+  output_t->mutable_data<T>(context.GetPlace());
+  NpuOpRunner runner;
+  runner.SetType("GatherV2")
+      .AddInput(table_t_pad)
+      .AddInput(ids_t_local)
+      .AddInput(std::vector<int32_t>{0})
+      .AddOutput(*output_t);
+  runner.Run();
+}
+
+template <typename T>
+class CEmbeddingNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+
+    const auto &index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      NPUGetIdsEmbedding<int32_t, T>(context);
+    } else {
+      PADDLE_THROW(platform::errors::Unavailable(
+          "NPU c_embedding ids only support int32."));
+    }
+  }
+};
+
+template <typename TIds, typename T>
+void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
+  // get inputs
+  const int64_t start_idx = context.Attr<int64_t>("start_index");
+  auto ids_t = context.Input<LoDTensor>("Ids");
+  auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
+  auto table_t = context.Input<Tensor>("W");
+  auto table_grad_t = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
+           << ", table_t:" << table_t << ", table_grad_t" << table_grad_t;
+
+  auto stream =
+      context.template device_context<paddle::platform::NPUDeviceContext>()
+          .stream();
+
+  // convert ids_t to local valid
+  framework::Tensor ids_t_local;
+  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
+  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
+
+  // padding table_t -> table_t_pad
+  auto pad_shape =
+      framework::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
+  framework::LoDTensor table_t_pad;
+
+  // set table_t_pad to zero
+  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
+      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
+  size_t table_t_pad_mem_size =
+      table_t_pad.numel() * framework::SizeOfType(table_t_pad.type());
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemsetAsync(pad_data, table_t_pad_mem_size, 0,
+                                              table_t_pad_mem_size, stream));
+
+  // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
+  // can be different tensor, but in cann 20.2+, it does inplace operation.
+  // Thus, the first input and output should be same tensor.
+  const auto &runner_scatter =
+      NpuOpRunner("ScatterAdd", {table_t_pad, ids_t_local, *d_output_t},
+                  {table_t_pad}, {{"use_locking", true}});
+  runner_scatter.Run(stream);
+
+  // copy table_t_pad to table_t
+  T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
+  const size_t mem_size =
+      table_grad_t->numel() * framework::SizeOfType(table_grad_t->type());
+
+  // check align
+  size_t line_mem_size =
+      table_grad_t->dims()[1] * framework::SizeOfType(table_grad_t->type());
+  PADDLE_ENFORCE_EQ(line_mem_size % 64, 0,
+                    platform::errors::InvalidArgument(
+                        "NPU only accept the second dim must align by 64"));
+
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpyAsync(
+      dst, mem_size, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream));
+}
+
+template <typename T>
+class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *ids_t = context.Input<LoDTensor>("Ids");
+
+    const auto &index_type = ids_t->type();
+    if (index_type == framework::proto::VarType::INT32) {
+      NPUUpdateEmbedding<int32_t, T>(context);
+    } else {
+      PADDLE_THROW(
+          platform::errors::Unavailable("c_embedding ids only support int32."));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(c_embedding, ops::CEmbeddingNPUKernel<float>,
+                       ops::CEmbeddingNPUKernel<double>,
+                       ops::CEmbeddingNPUKernel<plat::float16>);
+REGISTER_OP_NPU_KERNEL(c_embedding_grad, ops::CEmbeddingGradNPUKernel<float>,
+                       ops::CEmbeddingGradNPUKernel<double>,
+                       ops::CEmbeddingGradNPUKernel<plat::float16>);
--- a/python/paddle/distributed/collective.py
+++ b/python/paddle/distributed/collective.py
@@ -1230,65 +1230,6 @@ def _parallel_embedding(x,
    return out


-def _parallel_embedding_npu(x,
-                            per_part_embeddings,
-                            origin_size,
-                            param_attr,
-                            inner_rank,
-                            num_partitions,
-                            name,
-                            group=None):
-    """
-    NPU Parallel Embedding
-    """
-    if group is not None and not group.is_member():
-        return
-    ring_id = 0 if group is None else group.id
-
-    origin_num_embeddings = origin_size[0]
-    embedding = paddle.nn.Embedding(
-        per_part_embeddings,
-        origin_size[1],
-        padding_idx=per_part_embeddings - 1,
-        sparse=False,
-        weight_attr=param_attr,
-        name=name)
-
-    origin_input_shape = x.shape
-    if len(origin_input_shape) == 2:
-        x = paddle.unsqueeze(x, axis=-1)
-    else:
-        assert origin_input_shape[-1] == 1, (
-            "The last dimension size of x must be 1.")
-    x_shard = paddle.shard_index(x, origin_num_embeddings, num_partitions,
-                                 inner_rank, per_part_embeddings - 1)
-    if len(origin_input_shape) == 2:
-        x_shard = paddle.squeeze(x_shard, axis=-1)
-    emb_out = embedding(x_shard)
-    startup_block = paddle.static.default_startup_program().global_block()
-    main_block = paddle.static.default_main_program().global_block()
-    startup_block.vars[embedding.weight.name].is_distributed = True
-    main_block.vars[embedding.weight.name].is_distributed = True
-    out = main_block.create_var(
-        shape=emb_out.shape,
-        dtype=emb_out.dtype,
-        type=emb_out.type,
-        lod_level=emb_out.lod_level,
-        persistable=False,
-        is_data=False,
-        need_check_feed=emb_out.desc.need_check_feed())
-    main_block.append_op(
-        type='c_allreduce_sum',
-        inputs={'X': emb_out},
-        outputs={'Out': out},
-        attrs={
-            'ring_id': ring_id,
-            'use_calc_stream': True,
-            'use_model_parallel': True
-        })
-    return out
-
-
 def split(x,
          size,
          operation,
@@ -1403,18 +1344,6 @@ def split(x,
            "but received vocabulary={} num_partitions={}".format(size[0], num_partitions)

        per_part_size = size[0] // num_partitions
-        if core.is_compiled_with_npu():
-            emb_out = _parallel_embedding_npu(
-                x,
-                per_part_size,
-                size,
-                weight_attr,
-                inner_rank,
-                num_partitions,
-                name,
-                group=None)
-            return emb_out
-        else:
        emb_out = _parallel_embedding(
            x,
            per_part_size,

--- a/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
+++ b/python/paddle/fluid/tests/unittests/c_embedding_op_base.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.framework import core
+
+SEED = 2021
+np.random.seed(SEED)
+
+
+def get_c_embedding(start, end, table, ids):
+    index = ids.flatten()
+    input_mask = (index < start) | (index >= end)
+    masked_input = index - start
+    masked_input[input_mask] = 0
+    output = table[masked_input]
+    output[input_mask] = 0.0
+    return output
+
+
+class TestCEmbeddingCPU(OpTest):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+
+    def test_check_cpu(self):
+        self.check_output_with_place(core.CPUPlace())
+
+    def test_check_cpu_grad(self):
+        self.check_grad_with_place(core.CPUPlace(), ['W'], 'Out')
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int64"
+
+
+class TestCEmbeddingOpBase(TestCEmbeddingCPU):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def test_check_output(self):
+        if core.is_compiled_with_cuda():
+            self.check_output_with_place(core.CUDAPlace(0))
+        elif core.is_compiled_with_npu():
+            self.check_output_with_place(core.NPUPlace(0))
+
+    def test_check_grad(self):
+        if core.is_compiled_with_cuda():
+            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+        elif core.is_compiled_with_npu():
+            self.check_grad_with_place(core.NPUPlace(0), ['W'], 'Out')
+
+    def init_dtype(self):
+        if core.is_compiled_with_cuda():
+            self.dtype = "float64"
+            self.ids_dtype = "int64"
+        elif core.is_compiled_with_npu():
+            self.dtype = "float32"
+            self.ids_dtype = "int32"
+
+
+class TestCEmbeddingOpFP32(TestCEmbeddingOpBase):
+    def setUp(self):
+        self.init_dtype()
+        self.initcase()
+
+    def initcase(self):
+        self.op_type = "c_embedding"
+        table = np.random.random((17, 64)).astype(self.dtype)
+        ids = np.random.randint(
+            low=0, high=17 * 2, size=(2, 4)).astype(self.ids_dtype)
+        self.start_index = 10
+        ids[0][1] = 12
+        ids[0][2] = 12
+        ids[1][2] = 12
+        ids[1][3] = 12
+        self.end_index = self.start_index + 17
+
+        self.inputs = {'W': table, 'Ids': ids}
+        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
+        self.outputs = {'Out': np_out.reshape((2, 4, 64))}
+        self.attrs = {'start_index': self.start_index}
+
+        if core.is_compiled_with_npu():
+            self.__class__.use_npu = True
+        elif core.is_compiled_with_cuda():
+            self.__class__.exist_fp64_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = "float32"
+        self.ids_dtype = "int32"
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_c_embedding_op_npu.py
+#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.core as core
+from paddle.fluid.tests.unittests.c_embedding_op_base import TestCEmbeddingCPU, TestCEmbeddingOpBase, TestCEmbeddingOpFP32
+
+paddle.enable_static()
+
+TestCEmbeddingCPU()
+
+TestCEmbeddingOpBase()
+
+TestCEmbeddingOpFP32()
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_c_embedding_op.py
@@ -20,40 +20,13 @@ from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
 from paddle.framework import core
+from paddle.fluid.tests.unittests.c_embedding_op_base import TestCEmbeddingCPU, TestCEmbeddingOpBase, TestCEmbeddingOpFP32

+TestCEmbeddingCPU()

-def get_c_embedding(start, end, table, ids):
-    index = ids.flatten()
-    input_mask = (index < start) | (index >= end)
-    masked_input = index - start
-    masked_input[input_mask] = 0
-    output = table[masked_input]
-    output[input_mask] = 0.0
-    return output
-
-
-class TestCEmbeddingOp(OpTest):
-    def setUp(self):
-        self.op_type = "c_embedding"
-        table = np.random.random((17, 31)).astype("float64")
-        ids = np.random.randint(
-            low=0, high=17 * 2, size=(2, 4, 5)).astype("int32")
-        self.start_index = 10
-        self.end_index = self.start_index + 17
-
-        self.inputs = {'W': table, 'Ids': ids}
-        np_out = get_c_embedding(self.start_index, self.end_index, table, ids)
-        self.outputs = {'Out': np_out.reshape((2, 4, 5, 31))}
-        self.attrs = {'start_index': self.start_index}
-
-    def test_check_output_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.check_output_with_place(core.CUDAPlace(0))
-
-    def test_check_grad_gpu(self):
-        if core.is_compiled_with_cuda():
-            self.check_grad_with_place(core.CUDAPlace(0), ['W'], 'Out')
+TestCEmbeddingOpBase()

+TestCEmbeddingOpFP32()

 if __name__ == "__main__":
    unittest.main()