delete paddle/fluid/operators/collective/*_npu.* (#52677)

delete paddle/fluid/operators/collective/*_npu.* (#52677)
b451aff8 · jjyaoao · GitHub · c1cad896 · c1cad896 · c1cad896
30 changed files
--- a/paddle/fluid/operators/collective/c_allgather_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <memory>
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_allgather,
-                       ops::CAllGatherOpASCENDKernel<int8_t>,
-                       ops::CAllGatherOpASCENDKernel<int>,
-                       ops::CAllGatherOpASCENDKernel<int64_t>,
-                       ops::CAllGatherOpASCENDKernel<float>,
-                       ops::CAllGatherOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_allgather);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allgather, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 1;
-  int num2 = 4;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-  attrs["nranks"] = 2;
-  auto op = f::OpRegistry::CreateOp(
-      "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size() * 2);
-  for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
-    EXPECT_EQ(out_vec[i], 1.0);
-  }
-  for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 2.0);
-  }
-}
-TEST(c_allgather, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLAllGatherOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    c_allreduce_max,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_allreduce_max);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 100;
-  int num2 = 100;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id * 3);
-  }
-  PrintDebugInfo("input data", init);
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-  auto op = f::OpRegistry::CreateOp(
-      "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 4.0);
-  }
-}
-TEST(c_allreduce_max, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLAllReduceOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    c_allreduce_min,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    c_allreduce_prod,
-    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    c_allreduce_sum,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
-// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
-// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
-// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_allreduce_sum);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  std::cout << preStr << ":" << std::endl << debugstring;
-  for (auto ele : data) {
-    std::cout << ele << " ";
-  }
-  std::cout << std::endl;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-template <typename T>
-void TestHCCLAllReduceOp(f::Scope* scope,
-                         const p::DeviceContext& ctx,
-                         int iter) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 3;
-  int num2 = 128;
-  std::vector<T> init;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(static_cast<T>(1.0 + rank_id));
-  }
-  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
-  PrintDebugInfo("input data", init);
-  auto place = ctx.GetPlace();
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<T>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
-  attrs["ring_id"] = 0;
-  attrs["use_calc_stream"] = 1;
-  auto op = f::OpRegistry::CreateOp(
-      "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  for (int i = 0; i < 1; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  float diff = static_cast<float>(out_vec[0]) - 65504;
-  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 1; i < 10; i++) {
-    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
-  }
-}
-TEST(c_allreduce_sum, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  // only support one device, if more than one device, use first default
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
-  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
-}
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_broadcast,
-                       ops::CBroadcastOpASCENDKernel<int>,
-                       ops::CBroadcastOpASCENDKernel<int8_t>,
-                       ops::CBroadcastOpASCENDKernel<float>,
-                       ops::CBroadcastOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_broadcast);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  int num = 2;
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-  for (int64_t i = 0; i < num * num; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num, num});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num, num});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["root"] = 0;
-  attrs["ring_id"] = 0;
-  auto op = f::OpRegistry::CreateOp(
-      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 1.0);
-  }
-}
-TEST(c_broadcast, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLBroadcastOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <memory>
-#include <string>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/collective/c_embedding_op.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-inline void FillNPU(Tensor *dst,
-                    T val,
-                    const framework::ExecutionContext &context) {
-  Tensor value(dst->type());
-  value.mutable_data<T>({1}, context.GetPlace());
-  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  const auto &runner = NpuOpRunner(
-      "FillD", {value}, {*dst}, {{"dims", phi::vectorize(dst->dims())}});
-  runner.Run(stream);
-}
-template <typename T>
-void shard_index(const Tensor &table_t,
-                 const Tensor &ids_t,
-                 int64_t start_idx,
-                 const Tensor &id_t,
-                 const framework::ExecutionContext &context) {
-  const int height = table_t.dims()[0];
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  phi::DenseTensor id_t_d;
-  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_d, static_cast<T>(0.0), context);
-  id_t_d.Resize(ids_t.dims());
-  phi::DenseTensor id_t_u;
-  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
-  id_t_u.Resize(ids_t.dims());
-  phi::DenseTensor id_matched_d;
-  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor id_matched_u;
-  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
-  phi::DenseTensor ignore_tensor;
-  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
-  FillNPU(&ignore_tensor, static_cast<T>(height), context);
-  ignore_tensor.Resize(ids_t.dims());
-  NpuOpRunner sub_runner;
-#if (CANN_VERSION_CODE >= 503003)
-  Tensor factor_tensor(ids_t.type());
-  factor_tensor.mutable_data<T>({1}, context.GetPlace());
-  paddle::framework::TensorFromVector(std::vector<T>{static_cast<T>(start_idx)},
-                                      context.device_context(),
-                                      &factor_tensor);
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(factor_tensor)
-      .AddOutput(id_t);
-#else
-  sub_runner.SetType("Sub")
-      .AddInput(ids_t)
-      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
-      .AddOutput(id_t);
-#endif
-  sub_runner.Run();
-  NpuOpRunner lessequal1_runner;
-  lessequal1_runner.SetType("LessEqual")
-      .AddInput(id_t)
-      .AddInput(id_t_u)
-      .AddOutput(id_matched_u);
-  lessequal1_runner.Run();
-  NpuOpRunner lessequal2_runner;
-  lessequal2_runner.SetType("LessEqual")
-      .AddInput(id_t_d)
-      .AddInput(id_t)
-      .AddOutput(id_matched_d);
-  lessequal2_runner.Run();
-  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
-      .Run(stream);
-  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
-      .Run(stream);
-}
-template <typename TIds, typename T>
-void NPUGetIdsEmbedding(const framework::ExecutionContext &context) {
-  auto *table_t = context.Input<phi::DenseTensor>("W");
-  auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto *output_t = context.Output<phi::DenseTensor>("Out");
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-  auto pad_shape = phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-  size_t mem_size = table_t->numel() * phi::SizeOf(table_t->dtype());
-  size_t line_mem_size = table_t->dims()[1] * phi::SizeOf(table_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU only accept the second dim must align by 64"));
-  VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
-           << ", pad_shape:" << pad_shape << ", table_dims:" << table_t->dims();
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  platform::NPUMemcpyAsync(pad_data,
-                           table_t->data<T>(),
-                           mem_size,
-                           ACL_MEMCPY_DEVICE_TO_DEVICE,
-                           stream,
-                           mem_size);
-  platform::NPUMemsetAsync(
-      pad_data + mem_size, 0, line_mem_size, stream, line_mem_size);
-  output_t->mutable_data<T>(context.GetPlace());
-  NpuOpRunner runner;
-  runner.SetType("GatherV2")
-      .AddInput(table_t_pad)
-      .AddInput(ids_t_local)
-      .AddInput(std::vector<int32_t>{0})
-#if (CANN_VERSION_CODE >= 503003)
-      .AddAttrs({{"batch_dims", 0}})
-#endif
-      .AddOutput(*output_t);
-  runner.Run();
-}
-template <typename T>
-class CEmbeddingNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUGetIdsEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(platform::errors::Unavailable(
-          "NPU c_embedding ids only support int32."));
-    }
-  }
-};
-template <typename TIds, typename T>
-void NPUUpdateEmbedding(const framework::ExecutionContext &context) {
-  // get inputs
-  const int64_t start_idx = context.Attr<int64_t>("start_index");
-  auto ids_t = context.Input<phi::DenseTensor>("Ids");
-  auto d_output_t =
-      context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
-  auto table_t = context.Input<phi::DenseTensor>("W");
-  auto table_grad_t =
-      context.Output<phi::DenseTensor>(framework::GradVarName("W"));
-  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
-           << ", table_t:" << table_t << ", table_grad_t" << table_grad_t;
-  auto stream =
-      context.template device_context<paddle::platform::NPUDeviceContext>()
-          .stream();
-  // convert ids_t to local valid
-  phi::DenseTensor ids_t_local;
-  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
-  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);
-  // padding table_t -> table_t_pad
-  auto pad_shape = phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
-  phi::DenseTensor table_t_pad;
-  // set table_t_pad to zero
-  uint8_t *pad_data = reinterpret_cast<uint8_t *>(
-      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
-  size_t table_t_pad_mem_size =
-      table_t_pad.numel() *
-      framework::SizeOfType(
-          framework::TransToProtoVarType(table_t_pad.dtype()));
-  platform::NPUMemsetAsync(
-      pad_data, 0, table_t_pad_mem_size, stream, table_t_pad_mem_size);
-  // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
-  // can be different tensor, but in cann 20.2+, it does inplace operation.
-  // Thus, the first input and output should be same tensor.
-  const auto &runner_scatter =
-      NpuOpRunner("ScatterAdd",
-                  {table_t_pad, ids_t_local, *d_output_t},
-                  {table_t_pad},
-                  {{"use_locking", true}});
-  runner_scatter.Run(stream);
-  // copy table_t_pad to table_t
-  T *dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
-  const size_t mem_size =
-      table_grad_t->numel() * phi::SizeOf(table_grad_t->dtype());
-  // check align
-  size_t line_mem_size =
-      table_grad_t->dims()[1] * phi::SizeOf(table_grad_t->dtype());
-  PADDLE_ENFORCE_EQ(line_mem_size % 64,
-                    0,
-                    platform::errors::InvalidArgument(
-                        "NPU only accept the second dim must align by 64"));
-  platform::NPUMemcpyAsync(
-      dst, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream, mem_size);
-}
-template <typename T>
-class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *ids_t = context.Input<phi::DenseTensor>("Ids");
-    const auto &index_type = framework::TransToProtoVarType(ids_t->dtype());
-    if (index_type == framework::proto::VarType::INT32) {
-      NPUUpdateEmbedding<int32_t, T>(context);
-    } else {
-      PADDLE_THROW(
-          platform::errors::Unavailable("c_embedding ids only support int32."));
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_embedding,
-                       ops::CEmbeddingNPUKernel<float>,
-                       ops::CEmbeddingNPUKernel<double>,
-                       ops::CEmbeddingNPUKernel<plat::float16>);
-REGISTER_OP_NPU_KERNEL(c_embedding_grad,
-                       ops::CEmbeddingGradNPUKernel<float>,
-                       ops::CEmbeddingGradNPUKernel<double>,
-                       ops::CEmbeddingGradNPUKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_identity_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_identity_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_identity_op.h"
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_identity,
-                       ops::CIdentityOpKernel<float, plat::NPUPlace>,
-                       ops::CIdentityOpKernel<double, plat::NPUPlace>,
-                       ops::CIdentityOpKernel<int, plat::NPUPlace>,
-                       ops::CIdentityOpKernel<int64_t, plat::NPUPlace>,
-                       ops::CIdentityOpKernel<plat::float16, plat::NPUPlace>);
--- a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_reduce_max,
-                       ops::CReduceOpASCENDKernel<ops::kRedMax, int>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMax, int8_t>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMax, float>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
--- a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_reduce_min,
-                       ops::CReduceOpASCENDKernel<ops::kRedMin, int>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMin, int8_t>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMin, float>,
-                       ops::CReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
--- a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_reduce_prod,
-                       ops::CReduceOpASCENDKernel<ops::kRedProd, int>,
-                       ops::CReduceOpASCENDKernel<ops::kRedProd, int8_t>,
-                       ops::CReduceOpASCENDKernel<ops::kRedProd, float>,
-                       ops::CReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_reduce_sum,
-                       ops::CReduceOpASCENDKernel<ops::kRedSum, int>,
-                       ops::CReduceOpASCENDKernel<ops::kRedSum, int8_t>,
-                       ops::CReduceOpASCENDKernel<ops::kRedSum, float>,
-                       ops::CReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_reduce_sum);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(3) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 3;
-  int num2 = 128;
-  std::vector<float> init;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-  auto place = ctx.GetPlace();
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
-  attrs["ring_id"] = 0;
-  int root_id = 0;
-  attrs["root_id"] = root_id;
-  auto op = f::OpRegistry::CreateOp(
-      "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  op->Run(*scope, place);
-  ctx.Wait();
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    if (rank_id == root_id) {
-      EXPECT_EQ(out_vec[i], 3.0);
-    } else {
-      EXPECT_EQ(out_vec[i], init[i]);
-    }
-  }
-}
-TEST(c_reduce_sum, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  for (int i = 0; i < 2; i++) {
-    VLOG(2) << "iter num: " << i;
-    TestHCCLReduceOp(&scope, ctx, i);
-  }
-}
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(c_reducescatter,
-                       ops::CReduceScatterOpAscendKernel<int8_t>,
-                       ops::CReduceScatterOpAscendKernel<int>,
-                       ops::CReduceScatterOpAscendKernel<float>,
-                       ops::CReduceScatterOpAscendKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_reducescatter);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  std::vector<float> init;
-  int num1 = 4;
-  int num2 = 1;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0);
-  }
-  PrintDebugInfo("input data", init);
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-  attrs["nranks"] = 2;
-  auto op = f::OpRegistry::CreateOp(
-      "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  int iter_num = 10;
-  for (int i = 0; i < iter_num; i++) {
-    op->Run(*scope, place);
-    ctx.Wait();
-  }
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size() / 2);
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 2.0);
-  }
-}
-TEST(c_reducescatter, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLReduceScatterOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP_ITSELF(elementwise_add);
-USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
-USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU);
-template <typename T>
-void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  auto y = scope->Var("Y");
-  auto tensor_y = y->GetMutable<phi::DenseTensor>();
-  std::vector<T> init_x;
-  for (int64_t i = 0; i < 10 * 10; ++i) {
-    init_x.push_back(static_cast<T>(1.0));
-  }
-  std::vector<T> init_y;
-  for (int64_t i = 0; i < 10 * 10; ++i) {
-    init_y.push_back(static_cast<T>(2.0));
-  }
-  paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
-  tensor_x->Resize({10, 10});
-  paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
-  tensor_y->Resize({10, 10});
-  f::AttributeMap attrs;
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  // sync data
-  auto sync_op0 = f::OpRegistry::CreateOp(
-      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-  sync_op0->Run(*scope, place);
-  // run
-  auto op = f::OpRegistry::CreateOp("elementwise_add",
-                                    {{"X", {"X"}}, {"Y", {"Y"}}},
-                                    {{"Out", {"Out"}}},
-                                    attrs);
-  op->Run(*scope, place);
-  // sync op run
-  auto sync_op = f::OpRegistry::CreateOp(
-      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-  sync_op->Run(*scope, place);
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  // sync op copy
-  auto sync_op2 = f::OpRegistry::CreateOp(
-      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
-  sync_op2->Run(*scope, place);
-  float expected = 3.0;
-  EXPECT_EQ(out_vec.size(), init_x.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
-  }
-}
-TEST(c_sync_calc_stream, NPU_fp32) {
-  f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_broadcast);
-USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  int num = 2;
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-  std::cout << "rank_id:" << rank_id << std::endl;
-  for (int64_t i = 0; i < num * num; ++i) {
-    init.push_back(1.0 + rank_id);
-    std::cout << init[0];
-  }
-  std::cout << std::endl;
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num, num});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num, num});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["root"] = 0;
-  attrs["ring_id"] = 0;
-  auto op = f::OpRegistry::CreateOp(
-      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  op->Run(*scope, place);
-  // comm sync
-  auto sync_op = f::OpRegistry::CreateOp(
-      "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  sync_op->Run(*scope, place);
-  // ctx.Wait();
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 1.0);
-  }
-}
-TEST(c_sync_comm_stream_op, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLBroadcastOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <cmath>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(c_allreduce_sum);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
-DECLARE_string(selected_npus);
-template <typename T>
-bool Check(T value, int size = 2 * 512 * 8192) {
-  f::Scope scope;
-  auto x = scope.Var("in");
-  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
-      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
-  auto place = ctx.GetPlace();
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  tensor_x->Resize({size});
-  tensor_x->mutable_data<T>(place);  // allocate
-  std::vector<T> init;
-  for (int64_t i = 0; i < size; ++i) {
-    init.push_back(static_cast<T>(value));
-  }
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
-  return result;
-}
-TEST(check_numeric, NPU) {
-  auto inf = std::numeric_limits<float>::infinity();
-  auto fp16_inf = static_cast<p::float16>(inf);
-  auto nan = NAN;
-  auto fp16_nan = static_cast<p::float16>(nan);
-  bool result = false;
-  // Normal
-  VLOG(0) << "start normal";
-  result = Check<p::float16>(static_cast<p::float16>(65546));
-  ASSERT_FALSE(result);
-  Check<float>(static_cast<float>(1.0));
-  ASSERT_FALSE(result);
-  // Inf
-  VLOG(0) << "start inf";
-  result = Check<p::float16>(fp16_inf);
-  ASSERT_FALSE(result);
-  result = Check<float>(inf);
-  ASSERT_FALSE(result);
-  // Nan
-  VLOG(0) << "start nan";
-  result = Check<p::float16>(fp16_nan);
-  ASSERT_TRUE(result);
-  result = Check<float>(nan);
-  ASSERT_TRUE(result);
-}
--- a/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-namespace paddle {
-namespace platform {
-struct ASCENDPlace;
-}  // namespace platform
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(
-    mp_allreduce_sum,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
--- a/paddle/fluid/operators/collective/partial_allgather_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_allgather_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <memory>
-#include "paddle/fluid/operators/collective/partial_allgather_op.h"
-#include "paddle/fluid/platform/collective_helper.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(partial_allgather,
-                       ops::CallPartialGatherOpASCENDKernel<int8_t>,
-                       ops::CallPartialGatherOpASCENDKernel<int>,
-                       ops::CallPartialGatherOpASCENDKernel<float>,
-                       ops::CallPartialGatherOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_recv_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_recv_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/partial_recv_op.h"
-#include "paddle/fluid/platform/collective_helper.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(partial_recv,
-                       ops::PartialRecvOpASCENDKernel<int>,
-                       ops::PartialRecvOpASCENDKernel<int8_t>,
-                       ops::PartialRecvOpASCENDKernel<float>,
-                       ops::PartialRecvOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/partial_send_op_npu.cc
+++ b/paddle/fluid/operators/collective/partial_send_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/send_v2_op.h"
-#include "paddle/fluid/platform/collective_helper.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(partial_send,
-                       ops::PartialSendOpASCENDKernel<int>,
-                       ops::PartialSendOpASCENDKernel<int8_t>,
-                       ops::PartialSendOpASCENDKernel<float>,
-                       ops::PartialSendOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/recv_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/recv_v2_op.h"
-#include "paddle/fluid/distributed/collective/process_group.h"
-#include "paddle/phi/api/include/tensor.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CRecvOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(recv_v2,
-                       ops::CRecvOpASCENDKernel<int>,
-                       ops::CRecvOpASCENDKernel<int8_t>,
-                       ops::CRecvOpASCENDKernel<float>,
-                       ops::CRecvOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/operators/collective/recv_v2_op.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(recv_v2);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(recv_v2, NPU);
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
-  int num = atoi(getenv("DATA_SIZE"));
-  EXPECT_GT(num, 0);
-  EXPECT_LT(num, 1 << 15);
-  int rank_id = atoi(getenv("RANK_ID"));
-  VLOG(3) << "rank_id:" << rank_id << std::endl;
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Data");
-  auto tensor_out = out->GetMutable<phi::DenseTensor>();
-  tensor_out->Resize({num, num});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("srtest");
-  attrs["peer"] = atoi(getenv("SRC_RANK"));
-  attrs["ring_id"] = 0;
-  attrs["srTag"] = 0;
-  std::vector<int> out_shape;
-  out_shape.push_back(num);
-  out_shape.push_back(num);
-  attrs["out_shape"] = out_shape;
-  auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs);
-  VLOG(3) << "CreateOp recv_v2";
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  VLOG(3) << "Run op recv_v2";
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
-  EXPECT_EQ(out_vec == init, true);
-}
-TEST(recv_v2, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  char* npu_id = getenv("FLAGS_selected_npus");
-  VLOG(3) << "Select npu:" << npu_id;
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHcomRecvOp(&scope, ctx);
-}
--- a/paddle/fluid/operators/collective/send_v2_op_npu.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/collective/send_v2_op.h"
-#include "paddle/fluid/distributed/collective/process_group.h"
-#include "paddle/phi/api/include/tensor.h"
-namespace paddle {
-namespace operators {
-template <typename T>
-class CSendOpASCENDKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU."));
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-namespace ops = paddle::operators;
-namespace plat = paddle::platform;
-REGISTER_OP_NPU_KERNEL(send_v2,
-                       ops::CSendOpASCENDKernel<int>,
-                       ops::CSendOpASCENDKernel<int8_t>,
-                       ops::CSendOpASCENDKernel<float>,
-                       ops::CSendOpASCENDKernel<plat::float16>);
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-#include <stdio.h>
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/operators/collective/send_v2_op.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-USE_OP(send_v2);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(send_v2, NPU);
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-  VLOG(3) << "break";
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-  memcpy(hccl_id, id, 1024);
-}
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-  memcpy(id, hccl_id, 1024);
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<phi::DenseTensor>();
-  int num = atoi(getenv("DATA_SIZE"));
-  EXPECT_GT(num, 0);
-  EXPECT_LT(num, 1 << 15);
-  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
-  int rank_id = atoi(getenv("RANK_ID"));
-  VLOG(3) << "rank id:" << rank_id;
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num, num});
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  ctx.Wait();
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("srtest");
-  attrs["peer"] = atoi(getenv("DEST_RANK"));
-  attrs["ring_id"] = 0;
-  attrs["srTag"] = 0;
-  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs);
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  VLOG(3) << "send run over";
-  ctx.Wait();
-}
-TEST(send_v2, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  char* npu_id = getenv("FLAGS_selected_npus");
-  VLOG(3) << "Select npu:" << npu_id;
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHcomSendOp(&scope, ctx);
-}