机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit ce26f882 (unverified) — authored by lw921014, committed via GitHub on Apr 13, 2021.
update Ascendrc hccl to 20.3 (#32126)
Parent: 75dd8423

Showing 37 changed files with 1944 additions and 852 deletions (+1944 −852).
paddle/fluid/framework/operator.cc  +1 −0
paddle/fluid/framework/var_type_traits.h  +12 −0
paddle/fluid/operators/collective/CMakeLists.txt  +15 −11
paddle/fluid/operators/collective/c_allgather_op_npu.cc  +6 −8
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc  +65 −20
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc  +61 −16
paddle/fluid/operators/collective/c_allreduce_op.h  +11 −44
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc  +66 −17
paddle/fluid/operators/collective/c_broadcast_op_npu.cc  +4 −6
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc  +62 −18
paddle/fluid/operators/collective/c_comm_init_hccl_op.cc  +29 −30
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc  +113 −0
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc  +1 −0
paddle/fluid/operators/collective/c_reduce_op.h  +14 −45
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc  +64 −17
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc  +6 −8
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc  +60 −15
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc  +1 −1
paddle/fluid/operators/collective/gen_hccl_id_op.cc  +217 −0
paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc  +349 −0
paddle/fluid/operators/collective/gen_hccl_id_op_helper.h  +48 −0
paddle/fluid/operators/collective/recv_v2_op_npu.cc  +23 −16
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc  +105 −60
paddle/fluid/operators/collective/send_v2_op_npu.cc  +19 −13
paddle/fluid/operators/collective/send_v2_op_npu_test.cc  +92 −48
paddle/fluid/platform/collective_helper.h  +11 −14
paddle/fluid/platform/collective_helper_npu.cc  +34 −37
paddle/fluid/platform/device_context.h  +12 −2
paddle/fluid/platform/dynload/hccl.h  +13 −24
paddle/fluid/platform/enforce.h  +3 −3
paddle/fluid/platform/hccl_helper.h  +261 −266
python/paddle/distributed/fleet/ascend_utils.py  +6 −6
python/paddle/distributed/fleet/meta_optimizers/common.py  +16 −3
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py  +19 −6
python/paddle/fluid/framework.py  +40 −40
python/paddle/fluid/transpiler/collective.py  +19 −6
python/paddle/hapi/model.py  +66 −52
paddle/fluid/framework/operator.cc
@@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type,
   //  framework::OpRegistry::CreateOp(type, {}, {}, {}, false).
   //  Inputs, outputs and attrs will be set to empty map
   //  to improve the execution efficiency of dygraph.
+  if (inputs_.size() > 0 || outputs_.size() > 0) {
     GenerateTemporaryNames();
     CheckAllInputOutputSet();
...
paddle/fluid/framework/var_type_traits.h
@@ -31,6 +31,11 @@
 #endif
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <hccl/hccl.h>
+#include <hccl/hccl_types.h>
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif
...
@@ -45,6 +50,10 @@ class Communicator;
 class NCCLCommunicator;
 #endif
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+class Communicator;
+class HCCLCommunicator;
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 class BKCLCommunicator;
...
@@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #endif
     operators::CudnnRNNCache,
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+    HcclRootInfo,
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
     BKCLUniqueId, platform::BKCLCommunicator,
 #endif
...
paddle/fluid/operators/collective/CMakeLists.txt
@@ -11,7 +11,7 @@ foreach(src ${OPS})
     set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
 endforeach()

-register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op
+                   c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})

 if(WITH_NCCL)
   set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
...
@@ -24,39 +24,43 @@ if(WITH_GLOO)
 endif()

 if(WITH_XPU_BKCL)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common)
   op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
   op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()

 if(WITH_ASCEND_CL)
-  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+  cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
+  set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
+  op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
+  op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()

 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")

 if(WITH_ASCEND_CL)
-  set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags
+  set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
+      gen_hccl_id_op op_registry ascend_hccl flags
       dynamic_loader dynload_warpctc scope device_context enforce executor)
   cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
           DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
           DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
           DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
           DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
           DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
           DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
           DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
           DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
   cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
-          DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS}
+          DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS}
           ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
   cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
           DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS}
           ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 endif()
paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_ASCEND_CL)
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
-    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());

     int ring_id = ctx.Attr<int>("ring_id");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     auto place = ctx.GetPlace();
     auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
     int nranks = comm->nranks();
-    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     framework::DDim out_dims = in->dims();
     out_dims[0] *= nranks;
     out->mutable_data<T>(out_dims, place);

-    int64_t send_numel = in->numel();
+    uint64_t send_numel = in->numel();
     void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
     void *recv_buff = reinterpret_cast<void *>(out->data<T>());
...
@@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
     VLOG(3) << "begin hccl allgather, parameter is: "
             << ", group is " << group << ", ring_id is " << ring_id
-            << ", nranks is " << nranks << ", tag is " << tag;
+            << ", nranks is " << nranks;

-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
-        tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
-        group.c_str(), (void *)stream));
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
+        send_buff, recv_buff, send_numel, dtype, comm->comm(),
+        (void *)stream));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
...
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_allgather);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allgather, NPU);

 DECLARE_string(selected_npus);
...
@@ -56,26 +58,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   for (auto ele : data) {
     debugstring += std::to_string(ele) + std::string(",");
   }
   VLOG(2) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));

   VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
           << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
...
@@ -83,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();

   std::vector<float> init;
...
@@ -102,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
   ctx.Wait();

   auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -110,12 +154,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
   // run
   f::AttributeMap attrs;
   attrs["tag"] = std::string("tagx");
   attrs["ring_id"] = 0;
   attrs["nranks"] = 2;

-  auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   for (int i = 0; i < 10; i++) {
     op->Run(*scope, place);
...
@@ -139,11 +183,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_allgather, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
-  TestHCCLAllGatherOp(&scope, *ctx);
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLAllGatherOp(&scope, ctx);
 }
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_allreduce_max);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);

 DECLARE_string(selected_npus);
...
@@ -59,23 +61,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   VLOG(2) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));

   VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
           << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
...
@@ -83,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();

   std::vector<float> init;
...
@@ -102,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
   ctx.Wait();

   auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -113,8 +157,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
   attrs["tag"] = std::string("tagx");
   attrs["ring_id"] = 0;

-  auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   for (int i = 0; i < 10; i++) {
     op->Run(*scope, place);
...
@@ -135,11 +179,12 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_allreduce_max, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
-  TestHCCLAllReduceOp(&scope, *ctx);
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLAllReduceOp(&scope, ctx);
 }
paddle/fluid/operators/collective/c_allreduce_op.h
@@ -117,34 +117,18 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-    // we need to pre-allocate 512 Bytes before the data
-    // and 512 Bytes after the data, so the hccl allreduce
-    // can work. This is a must acooding to huawei peer.
-#define PRE_MALLOC_SIZE_BYTES 512
-
     auto in = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
     auto place = ctx.GetPlace();
-    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();
-    int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T);
-    int64_t tmp_numel = numel + pre_tmp_size * 2;
-
-    paddle::framework::LoDTensor tmp_in, tmp_out;
-    tmp_in.Resize({tmp_numel});
-    tmp_out.Resize({tmp_numel});
-    auto p_tmp_in = tmp_in.mutable_data<T>(place);    // allocate
-    auto p_tmp_out = tmp_out.mutable_data<T>(place);  // allocate
-
-    void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
-    void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);
+    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+    void* recvbuff = reinterpret_cast<void*>(out->data<T>());

     int ring_id = ctx.Attr<int>("ring_id");
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
-    std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     aclrtStream stream = nullptr;
     auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
...
@@ -154,33 +138,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       stream = comm->stream();
     }

-    // we need to memset this memory firstly to avoid core by hccl
-    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_in), 0,
-                             tmp_numel * sizeof(T), stream);
-    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_out), 0,
-                             tmp_numel * sizeof(T), stream);
-
-    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
-
-    memory::Copy(npu_place, sendbuff, npu_place,
-                 reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
-                 numel * sizeof(T), stream);
-
-    hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
+    HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;

     switch (red_type) {
       case kRedSum:
-        hccl_red_type = HCCL_REP_OP_SUM;
+        hccl_red_type = HCCL_REDUCE_SUM;
         break;

       case kRedMax:
-        hccl_red_type = HCCL_REP_OP_MAX;
+        hccl_red_type = HCCL_REDUCE_MAX;
         break;

       case kRedMin:
-        hccl_red_type = HCCL_REP_OP_MIN;
+        hccl_red_type = HCCL_REDUCE_MIN;
         break;

       case kRedProd:
-        hccl_red_type = HCCL_REP_OP_PROD;
+        hccl_red_type = HCCL_REDUCE_PROD;
         break;

       default:
...
@@ -192,16 +165,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
             << "input num: " << numel << "dtype: " << dtype
             << "hccl_red_type: " << hccl_red_type
-            << ", group is: " << group << ", tag is " << tag;
-
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
-        tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type,
-        group.c_str(), (void*)stream));
-
-    memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
-                 npu_place, recvbuff, numel * sizeof(T), stream);
+            << ", group is: " << group;
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
+        sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
+        (void*)stream));

     out->Resize(in->dims());
 #else
...
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -42,7 +43,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_allreduce_sum);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);

 DECLARE_string(selected_npus);
...
@@ -56,23 +58,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   VLOG(3) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));

   VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
           << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
...
@@ -81,7 +125,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();

   int rank_id = atoi(getenv("RANK_ID"));
...
@@ -100,7 +144,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   tensor_x->Resize({num1, num2});
   ctx.Wait();

-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -111,8 +155,10 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
   attrs["tag"] = std::string("tagx_" + std::to_string(iter));
   attrs["ring_id"] = 0;

-  auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   for (int i = 0; i < 10; i++) {
     op->Run(*scope, place);
...
@@ -133,14 +179,17 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
 TEST(c_allreduce_sum, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  // auto* ctx = p::DeviceContextPool::Instance().Get(
+  //     p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
   for (int i = 0; i < 1; i++) {
     VLOG(2) << "iter num: " << i;
-    TestHCCLAllReduceOp(&scope, *ctx, i);
+    TestHCCLAllReduceOp(&scope, ctx, i);
   }
 }
paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -30,7 +30,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
     auto x = ctx.Input<framework::LoDTensor>("X");
     void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
     int numel = x->numel();
-    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+    HcclDataType dtype = platform::ToHCCLDataType(x->type());

     auto out = ctx.Output<framework::LoDTensor>("Out");
...
@@ -48,14 +48,12 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
     int root = ctx.Attr<int>("root");
     std::string group =
         std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    std::string tag =
-        std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     VLOG(3) << "begin hccl broadcast, parameter is: "
             << "root " << root
-            << ", group is " << group << ", tag is " << tag;
+            << ", group is " << group
+            << ", comm: " << comm->comm() << ", stream: " << stream;

-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(
-        tag.c_str(), ptr, numel, dtype, (uint32_t)root, group.c_str(),
-        (void*)stream));
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
+        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));

     VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
             << framework::product(out->dims());
...
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"

 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -42,7 +43,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_broadcast);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_broadcast, NPU);

 DECLARE_string(selected_npus);
...
@@ -53,26 +55,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   for (auto ele : data) {
     debugstring += std::to_string(ele) + std::string(",");
   }
   VLOG(2) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));

   VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
           << "; rank_id = " << rank_id
           << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
...
@@ -80,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();
   int num = 2;
   std::vector<float> init;
...
@@ -96,7 +140,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   ctx.Wait();
   auto place = ctx.GetPlace();

-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num, num});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -108,8 +152,8 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   attrs["root"] = 0;
   attrs["ring_id"] = 0;

-  auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   for (int i = 0; i < 10; i++) {
     op->Run(*scope, place);
...
@@ -129,11 +173,11 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_broadcast, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
-  TestHCCLBroadcastOp(&scope, *ctx);
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLBroadcastOp(&scope, ctx);
 }
paddle/fluid/operators/collective/c_comm_init_hcom_op.cc → paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
...
@@ -12,66 +12,67 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/hccl_helper.h"
-
 #include <string>
-#include <vector>

 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif

 namespace paddle {
 namespace operators {

-class CCommInitOpNPU : public framework::OperatorBase {
+class CCommInitOpAscend : public framework::OperatorBase {
  public:
-  CCommInitOpNPU(const std::string& type,
-                 const framework::VariableNameMap& inputs,
+  CCommInitOpAscend(const std::string& type,
+                    const framework::VariableNameMap& inputs,
                  const framework::VariableNameMap& outputs,
                  const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {}

   void RunImpl(const framework::Scope& scope,
                const platform::Place& place) const override {
-    int rid = Attr<int>("ring_id");
-    int nranks = Attr<int>("nranks");
+    PADDLE_ENFORCE_EQ(is_npu_place(place), true,
+                      platform::errors::PreconditionNotMet(
+                          "CCommInitOpAscend can run on npu place only."));
+
+    auto var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::InvalidArgument("Input con not be empty."));
+#if defined(PADDLE_WITH_ASCEND_CL)
+    HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
+
+    int rank_ids = Attr<int>("rank_ids");
     int rank_id = Attr<int>("rank");
+    int rid = Attr<int>("ring_id");
     int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
     if (Attr<int>("device_id") >= 0) {
       device_id = Attr<int>("device_id");
     }
-    std::vector<int> rank_ids = Attr<std::vector<int>>("rank_ids");
-    VLOG(3) << "begin c_comm_init on npu, parameters are: "
-            << "ring id[" << rid << "], nranks[" << nranks << "], rank_id["
-            << rank_id << "], device_id[" << device_id << "]";
     platform::HCCLCommContext::Instance().CreateHCCLComm(
-        rank_ids, rank_id, device_id, rid);
+        hccl_id, rank_ids, rank_id, device_id, rid);
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
   }
 };

-class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker {
+class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
+    AddInput("X", "Raw variable contains a NCCL UniqueId instaces.");
     AddComment(R"DOC(
-CCommInit operator on NPU
+CCommInit operator

-Initialize collective communication context within this trainer
+Initialize collective communicatoin context within this trainer
)DOC");
-    AddAttr<int>("nranks", "(int) The number of ranks of distributed trainers");
-    AddAttr<std::vector<int>>("rank_ids", "The world rank ids of the group");
+    AddAttr<int>("rank_ids",
+                 "(int) The number of ranks of distributed trainers");
     AddAttr<int>("rank",
                  "(int) The rank of the trainer in distributed training.");
     AddAttr<int>("device_id",
...
@@ -89,6 +90,4 @@ Initialize collective communication context within this trainer
 namespace ops = paddle::operators;

-REGISTER_OPERATOR(c_comm_init_hcom, ops::CCommInitOpNPU,
-                  ops::CCommInitOpNPUMaker);
-
-#endif
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend,
+                  ops::CCommInitOpAscendMaker);
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc (new file, 0 → 100644)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <string>

#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#endif

namespace paddle {
namespace operators {

#ifdef PADDLE_WITH_ASCEND_CL
class CGenHCCLIdOp : public framework::OperatorBase {
 public:
  CGenHCCLIdOp(const std::string& type,
               const framework::VariableNameMap& inputs,
               const framework::VariableNameMap& outputs,
               const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    int rank = Attr<int>("rank");
    framework::Scope& local_scope = scope.NewScope();

    std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
      return Output("Out");
    };
    if (rank == 0) {
      std::vector<std::string> endpoint_list =
          Attr<std::vector<std::string>>("other_endpoints");
      SendBroadCastHCCLID(endpoint_list, 1, func, local_scope);
    } else {
      std::string endpoint = Attr<std::string>("endpoint");
      RecvBroadCastHCCLID(endpoint, 1, func, local_scope);
    }
    scope.DeleteScope(&local_scope);
  }
};

#else
class CGenHCCLIdOp : public framework::OperatorBase {
 public:
  CGenHCCLIdOp(const std::string& type,
               const framework::VariableNameMap& inputs,
               const framework::VariableNameMap& outputs,
               const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {}
};

#endif

class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    VLOG(3) << "ele";
    AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces.");
    AddComment(R"DOC(
CGenHCCLId operator

For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC");
    AddAttr<std::string>("endpoint",
                         "(string), e.g. 127.0.0.1:6175 "
                         "current listen endpoint");
    AddAttr<std::vector<std::string>>(
        "other_endpoints",
        "['trainer1_ip:port', 'trainer2_ip:port', ...] "
        "list of other trainer endpoints")
        .SetDefault({});
    AddAttr<int>("rank",
                 "(int default 0) "
                 "The rank of the trainer in distributed training.")
        .SetDefault(0);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;

REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker);
paddle/fluid/operators/collective/c_gen_nccl_id_op.cc
@@ -63,6 +63,7 @@ class CGenNCCLIdOp : public framework::OperatorBase {
 };

 #else
+
 class CGenNCCLIdOp : public framework::OperatorBase {
  public:
   CGenNCCLIdOp(const std::string& type,
...
paddle/fluid/operators/collective/c_reduce_op.h
@@ -121,31 +121,15 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
+    // we need to pre-allocate 512 Bytes before the data
+    // and 512 Bytes after the data, so the hccl allreduce
+    // can work. This is a must acooding to huawei peer.
+    #define PRE_MALLOC_SIZE_BYTES 512
+
     auto in = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
     auto place = ctx.GetPlace();
-    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());
     int64_t numel = in->numel();

+    int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T);
+    int64_t tmp_numel = numel + pre_tmp_size * 2;
+
+    paddle::framework::LoDTensor tmp_in, tmp_out;
+    tmp_in.Resize({tmp_numel});
+    tmp_out.Resize({tmp_numel});
+    auto p_tmp_in = tmp_in.mutable_data<T>(place);    // allocate
+    auto p_tmp_out = tmp_out.mutable_data<T>(place);  // allocate
+
-    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
-    void* recvbuff = reinterpret_cast<void*>(out->data<T>());
+    void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
+    void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);

     std::string tag = ctx.Attr<std::string>("tag");
     int ring_id = ctx.Attr<int>("ring_id");
     int root_id = ctx.Attr<int>("root_id");
     std::string group =
         std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
...
@@ -161,33 +145,22 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
     int rank_id = comm->rank();

+    // we need to memset this memory firstly to avoid core by hccl
+    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_in), 0,
+                             tmp_numel * sizeof(T), stream);
+    platform::NPUMemsetAsync(static_cast<void*>(p_tmp_out), 0,
+                             tmp_numel * sizeof(T), stream);
+
+    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
+
+    memory::Copy(npu_place, sendbuff, npu_place,
+                 reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
+                 numel * sizeof(T), stream);

-    hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
+    HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;

     switch (red_type) {
       case kRedSum:
-        hccl_red_type = HCCL_REP_OP_SUM;
+        hccl_red_type = HCCL_REDUCE_SUM;
         break;

       case kRedMax:
-        hccl_red_type = HCCL_REP_OP_MAX;
+        hccl_red_type = HCCL_REDUCE_MAX;
         break;

       case kRedMin:
-        hccl_red_type = HCCL_REP_OP_MIN;
+        hccl_red_type = HCCL_REDUCE_MIN;
         break;

       case kRedProd:
-        hccl_red_type = HCCL_REP_OP_PROD;
+        hccl_red_type = HCCL_REDUCE_PROD;
         break;

       default:
...
@@ -200,18 +173,14 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
             << "root_id: " << root_id << "dtype: " << dtype
             << "hccl_red_type: " << hccl_red_type
-            << ", group is: " << group << ", tag is " << tag;
+            << ", group is: " << group;

-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
-        tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type,
-        group.c_str(), (void*)stream));
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
+        sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
+        (void*)stream));

+    if (rank_id == root_id) {
+      memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
+                   npu_place, recvbuff, numel * sizeof(T), stream);
+    } else {
-    if (rank_id != root_id) {
-      auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
       memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
                    npu_place,
                    reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
                    numel * sizeof(T),
...
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"

 #include "paddle/fluid/operators/collective/c_reduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -42,7 +43,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_reduce_sum);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);

 DECLARE_string(selected_npus);
...
@@ -56,23 +58,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   VLOG(3) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));

-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
...
@@ -80,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();

   int rank_id = atoi(getenv("RANK_ID"));
...
@@ -99,7 +143,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
   tensor_x->Resize({num1, num2});
   ctx.Wait();

-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -112,8 +156,10 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
   int root_id = 0;
   attrs["root_id"] = root_id;

-  auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   op->Run(*scope, place);
   ctx.Wait();
...
@@ -136,14 +182,15 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
 TEST(c_reduce_sum, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
-  for (int i = 0; i < 2; i++) {
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  for (int i = 0; i < 2; i++) {
     VLOG(2) << "iter num: " << i;
-    TestHCCLReduceOp(&scope, *ctx, i);
+    TestHCCLReduceOp(&scope, ctx, i);
   }
 }
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
@@ -35,7 +35,6 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
     int nranks = comm->nranks();
-    std::string tag =
-        std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());

     auto out_dims = in->dims();
     PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
...
@@ -47,11 +46,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
     out_dims[0] = out_dims[0] / nranks;
     out->mutable_data<T>(out_dims, place);

-    int64_t recv_numel = in->numel() / nranks;
+    uint64_t recv_numel = in->numel() / nranks;

     void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
     void* outputPtr = reinterpret_cast<void*>(out->data<T>());
-    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+    HcclDataType dtype = platform::ToHCCLDataType(in->type());

     aclrtStream stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
...
@@ -63,12 +62,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
     VLOG(3) << "begin hccl reduce scatter, parameter is: "
             << "recv_numel: " << recv_numel << "dtype: " << dtype
-            << "hccl_red_type: " << HCCL_REP_OP_SUM << ", group is: " << group
-            << ", tag is " << tag;
+            << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;

-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter(
-        tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype,
-        HCCL_REP_OP_SUM, group.c_str(), (void*)stream));
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter(
+        inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(),
+        (void*)stream));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
...
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(c_reducescatter);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);

 DECLARE_string(selected_npus);
...
@@ -59,7 +61,8 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
   VLOG(2) << preStr << ":" << std::endl << debugstring;
 }

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));
...
@@ -68,22 +71,63 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
           << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

   // std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank_ids"] = 2;
   comm_init_attrs["rank"] = rank_id;
   comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
+  // comm_init_attrs["rank_ids"] = rank_ids;
   auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hcom", {}, {}, comm_init_attrs);
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
 }

 void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
   // init
-  auto x = scope->Var("X");
+  auto x = scope->Var("Data");
   auto tensor_x = x->GetMutable<f::LoDTensor>();

   std::vector<float> init;
...
@@ -101,7 +145,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
   ctx.Wait();
   auto place = ctx.GetPlace();

-  auto out = scope->Var("Out");
+  auto out = scope->Var("OutData");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num1, num2});
   tensor_out->mutable_data<float>(place);  // allocate
...
@@ -114,14 +158,14 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
   attrs["ring_id"] = 0;
   attrs["nranks"] = 2;

-  auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}}, attrs);
+  auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}},
+                                    {{"Out", {"OutData"}}}, attrs);

   int iter_num = 10;
   for (int i = 0; i < iter_num; i++) {
     op->Run(*scope, place);
     ctx.Wait();
   }
   ctx.Wait();

   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
...
@@ -130,17 +174,18 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
   PrintDebugInfo("output data", out_vec);

   EXPECT_EQ(out_vec.size(), init.size() / 2);
   for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], iter_num + 1);
+    EXPECT_EQ(out_vec[i], 2.0);
   }
 }

 TEST(c_reducescatter, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   // only support one device, if more than one device, use first default
-  auto* ctx = p::DeviceContextPool::Instance().Get(
-      p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

-  Prepare(&scope, *ctx);
-  TestHCCLReduceScatterOp(&scope, *ctx);
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLReduceScatterOp(&scope, ctx);
 }
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -41,7 +41,7 @@ namespace m = paddle::operators::math;
 USE_OP(c_broadcast);
 USE_NO_KERNEL_OP(c_sync_comm_stream);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_broadcast, NPU);

 void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
...
paddle/fluid/operators/collective/gen_hccl_id_op.cc (new file, 0 → 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <ostream>
#include <string>

#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"

#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

namespace paddle {
namespace operators {

#ifdef PADDLE_WITH_ASCEND_CL

class GenHCCLIdOp : public framework::OperatorBase {
 public:
  GenHCCLIdOp(const std::string& type,
              const framework::VariableNameMap& inputs,
              const framework::VariableNameMap& outputs,
              const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    std::vector<std::string> trainers =
        Attr<std::vector<std::string>>("trainers");
    int trainer_id = Attr<int>("trainer_id");
    std::string endpoint = trainers[trainer_id];

    PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument(
                                         "trainer_id %d is less than 0. Its "
                                         "valid range is [0, trainer_size)"));
    PADDLE_ENFORCE_LT(
        trainer_id, static_cast<int>(trainers.size
()),
platform
::
errors
::
OutOfRange
(
"trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)"
,
trainer_id
));
int
hccl_comm_num
=
Attr
<
int
>
(
"hccl_comm_num"
);
int
use_hierarchical_allreduce
=
Attr
<
bool
>
(
"use_hierarchical_allreduce"
);
int
inter_nranks
=
Attr
<
int
>
(
"hierarchical_allreduce_inter_nranks"
);
int
inter_trainer_id
=
-
1
;
int
exter_trainer_id
=
-
1
;
if
(
use_hierarchical_allreduce
)
{
PADDLE_ENFORCE_GT
(
trainers
.
size
(),
1
,
platform
::
errors
::
PreconditionNotMet
(
"The number of collective trainers %llu <= 1"
,
trainers
.
size
()));
PADDLE_ENFORCE_GT
(
inter_nranks
,
1
,
platform
::
errors
::
PreconditionNotMet
(
"inter_nranks %d <= 1 while in hierarchical allreduce mode"
,
inter_nranks
));
PADDLE_ENFORCE_EQ
(
trainers
.
size
()
%
inter_nranks
,
0
,
platform
::
errors
::
PreconditionNotMet
(
"The number of trainers %llu mod inter_nranks %d is not equal 0"
,
trainers
.
size
(),
inter_nranks
));
inter_trainer_id
=
trainer_id
%
inter_nranks
;
if
(
trainer_id
%
inter_nranks
==
0
)
{
exter_trainer_id
=
trainer_id
/
inter_nranks
;
}
}
std
::
ostringstream
ss
;
for
(
size_t
i
=
0
;
i
<
trainers
.
size
();
i
++
)
{
ss
<<
trainers
[
i
]
<<
","
;
}
VLOG
(
1
)
<<
"trainer_id:"
<<
trainer_id
<<
", use_hierarchical_allreduce:"
<<
use_hierarchical_allreduce
<<
", hccl_comm_num:"
<<
hccl_comm_num
<<
", inter_nranks:"
<<
inter_nranks
<<
", inter_trainer_id:"
<<
inter_trainer_id
<<
", exter_trainer_id:"
<<
exter_trainer_id
<<
", trainers:"
<<
ss
.
str
();
int
server_fd
=
-
1
;
/// 1. init flat
std
::
function
<
std
::
string
(
size_t
)
>
func
=
platform
::
GetFlatHCCLVarName
;
if
(
trainer_id
==
0
)
{
// server endpoints
std
::
vector
<
std
::
string
>
flat_endpoints
;
flat_endpoints
.
insert
(
flat_endpoints
.
begin
(),
trainers
.
begin
()
+
1
,
trainers
.
end
());
SendBroadCastHCCLID
(
flat_endpoints
,
hccl_comm_num
,
func
,
scope
);
}
else
{
server_fd
=
CreateListenSocket
(
endpoint
);
RecvBroadCastHCCLID
(
server_fd
,
endpoint
,
hccl_comm_num
,
func
,
scope
);
}
/// 2. hierarchical inter ncclid
func
=
platform
::
GetHierarchicalInterHCCLVarName
;
if
(
inter_trainer_id
==
0
)
{
std
::
ostringstream
ss
;
ss
<<
endpoint
;
std
::
vector
<
std
::
string
>
inter_endpoints
;
for
(
int
i
=
trainer_id
+
1
;
i
<
trainer_id
+
inter_nranks
&&
i
<
static_cast
<
int
>
(
trainers
.
size
());
i
++
)
{
ss
<<
","
;
inter_endpoints
.
push_back
(
trainers
[
i
]);
ss
<<
trainers
[
i
];
}
VLOG
(
1
)
<<
"Hierarchical inter ring endpoints:"
<<
ss
.
str
();
SendBroadCastHCCLID
(
inter_endpoints
,
hccl_comm_num
,
func
,
scope
);
}
else
if
(
inter_trainer_id
>
0
)
{
VLOG
(
1
)
<<
"Hierarchical inter ring"
;
RecvBroadCastHCCLID
(
server_fd
,
endpoint
,
hccl_comm_num
,
func
,
scope
);
}
/// 3. hierarchical exter ncclid
func
=
platform
::
GetHierarchicalExterHCCLVarName
;
if
(
exter_trainer_id
==
0
)
{
std
::
ostringstream
ss
;
std
::
vector
<
std
::
string
>
exter_endpoints
;
ss
<<
endpoint
;
for
(
size_t
i
=
inter_nranks
;
i
<
trainers
.
size
();
i
+=
inter_nranks
)
{
ss
<<
","
;
exter_endpoints
.
push_back
(
trainers
[
i
]);
ss
<<
trainers
[
i
];
}
VLOG
(
1
)
<<
"Hierarchical exter ring endpoints:"
<<
ss
.
str
();
SendBroadCastHCCLID
(
exter_endpoints
,
hccl_comm_num
,
func
,
scope
);
}
else
if
(
exter_trainer_id
>
0
)
{
VLOG
(
1
)
<<
"Hierarchical exter ring"
;
RecvBroadCastHCCLID
(
server_fd
,
endpoint
,
hccl_comm_num
,
func
,
scope
);
}
// close socket server
if
(
trainer_id
!=
0
)
{
CloseSocket
(
server_fd
);
}
}
};
#else
class
GenHCCLIdOp
:
public
framework
::
OperatorBase
{
public:
GenHCCLIdOp
(
const
std
::
string
&
type
,
const
framework
::
VariableNameMap
&
inputs
,
const
framework
::
VariableNameMap
&
outputs
,
const
framework
::
AttributeMap
&
attrs
)
:
OperatorBase
(
type
,
inputs
,
outputs
,
attrs
)
{}
void
RunImpl
(
const
framework
::
Scope
&
scope
,
const
platform
::
Place
&
dev_place
)
const
override
{
}
};
#endif
class
GenHCCLIdOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
override
{
AddOutput
(
"HCCLID"
,
"Raw variable contains a HCCL UniqueId instaces."
);
AddComment
(
R"DOC(
GenHCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC"
);
AddAttr
<
std
::
vector
<
std
::
string
>>
(
"trainers"
,
"['trainer0_ip:port', 'trainer1_ip:port', ...] "
"list of all trainer endpoints"
)
.
SetDefault
({});
AddAttr
<
int
>
(
"trainer_id"
,
"(int) "
"The index of the trainer in distributed training."
);
AddAttr
<
int
>
(
"hccl_comm_num"
,
"(int default 1) "
"The number of nccl communicator num."
)
.
SetDefault
(
1
);
AddAttr
<
bool
>
(
"use_hierarchical_allreduce"
,
"(bool default false) "
"Wheter to use hierarchical allreduce."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"hierarchical_allreduce_inter_nranks"
,
"(int default 1) "
"Wheter to use hierarchical allreduce."
)
.
SetDefault
(
-
1
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
REGISTER_OPERATOR
(
gen_hccl_id
,
ops
::
GenHCCLIdOp
,
ops
::
GenHCCLIdOpMaker
);
paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
0 → 100644

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <algorithm>
#include <ostream>
#include <string>

#include "glog/logging.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif

namespace paddle {
namespace operators {

constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
#define HCCL_UNIQUE_ID_BYTES 1024

// Check system calls, such as socket, bind.
#define CHECK_SYS_CALL(call, name)          \
  do {                                      \
    int retval;                             \
    CHECK_SYS_CALL_VAL(call, name, retval); \
  } while (false)

#define CHECK_SYS_CALL_VAL(call, name, retval)                            \
  do {                                                                    \
    RETRY_SYS_CALL_VAL(call, name, retval);                               \
    if (retval == -1) {                                                   \
      PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \
                                                 name, strerror(errno))); \
    }                                                                     \
  } while (false)

#define RETRY_SYS_CALL_VAL(call, name, retval)                           \
  do {                                                                   \
    retval = (call);                                                     \
    if (retval == -1 &&                                                  \
        (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) {   \
      LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \
                   << " retry";                                          \
    } else {                                                             \
      break;                                                             \
    }                                                                    \
  } while (true)

static int SocketSend(int fd, const char* buffer, int size) {
  int offset = 0;
  int bytes = 0;
  while (offset < size) {
    bytes = send(fd, buffer + offset, size - offset, 0);
    if (bytes == -1) {
      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
        // send failed
        return -1;
      } else {
        bytes = 0;
      }
    }
    offset += bytes;
  }
  return offset;
}

static int SocketRecv(int fd, char* buffer, int size) {
  int offset = 0;
  int bytes = 0;
  while (offset < size) {
    bytes = recv(fd, buffer + offset, size - offset, 0);
    if (bytes == 0) {
      // closed by client, maybe probing alive client
      return 0;
    }
    if (bytes == -1) {
      if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
        return -1;
      } else {
        bytes = 0;
      }
    }
    offset += bytes;
  }
  return offset;
}

static void BindOrConnectFailed(int timeout, int* try_times, int* total_time,
                                const char* op, const std::string& ep) {
  PADDLE_ENFORCE_LT(
      *total_time, timeout,
      platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s",
                                    op, ep.c_str(), strerror(errno)));
  ++(*try_times);
  int retry_time = std::min(*try_times * 500, 3000);  // max 3 seconds
  *total_time += retry_time;

  LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times
               << " times with reason: " << strerror(errno) << " retry after "
               << retry_time / 1000.0 << " seconds";
  std::this_thread::sleep_for(std::chrono::milliseconds(retry_time));
}

int CreateListenSocket(const std::string& ep) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(
      addr.size(), 2UL,
      platform::errors::InvalidArgument(
          "The endpoint should contain host and port, but got %s.", ep));
  std::string host = addr[0];
  int port = std::stoi(addr[1]);

  // creating socket fd
  int server_fd = -1;
  CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd);

  // NOTE. Solutions to `Address already in use`.
  // 1. Reuse addr&port. Otherwise, once the server closes the socket
  // before client, the server will enter TIME-WAIT status. If we bind port
  // again, the error `Address already in use` will appear.
  // 2. Or we can close the client first to ensure that the server does
  // not enter the TIME-WAIT state. But this is obviously not as convenient
  // as the reuse method.
  int opt = 1;
#if defined(SO_REUSEPORT)
  // since Linux kernel 3.9
  CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT,
                            &opt, sizeof(opt)),
                 "setsockopt");
#else
  CHECK_SYS_CALL(
      setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)),
      "setsockopt");
#endif

  struct sockaddr_in address;
  address.sin_family = AF_INET;
  address.sin_addr.s_addr = INADDR_ANY;
  address.sin_port = htons(port);

  // TODO(wangxi) Set from env, default 900s=15min
  int timeout = 900 * 1000;
  int try_times = 0;
  int total_time = 0;
  while (true) {
    int ret_val = -1;
    RETRY_SYS_CALL_VAL(
        bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind",
        ret_val);

    if (ret_val == -1) {
      BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep);
      continue;
    }
    break;
  }

  CHECK_SYS_CALL(listen(server_fd, 3), "listen");
  LOG(INFO) << "Server listening on: " << ep << " successful.";
  return server_fd;
}

void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); }

static int SocketAccept(int server_fd, const char* head) {
  struct sockaddr_in client_addr;
  socklen_t addr_length = sizeof(client_addr);
  char buffer[1024] = {0};
  int conn = -1;

  while (true) {
    CHECK_SYS_CALL_VAL(
        accept(server_fd, reinterpret_cast<struct sockaddr*>(&client_addr),
               &addr_length),
        "accept", conn);

    int ret_val = SocketRecv(conn, buffer, strlen(head));
    if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) {
      break;  // accept client
    } else {
      VLOG(3) << "socket read failed with ret_val=" << ret_val;
      CloseSocket(conn);
    }
  }
  return conn;
}

static int ConnectAddr(const std::string& ep, const char* head) {
  auto addr = paddle::string::Split(ep, ':');
  PADDLE_ENFORCE_EQ(
      addr.size(), 2UL,
      platform::errors::InvalidArgument(
          "The endpoint should contain host and port, but got %s.", ep));
  std::string host = addr[0];
  int port = std::stoi(addr[1]);

  int sock = -1;
  CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock);

  struct sockaddr_in server_addr;
  memset(&server_addr, 0, sizeof(server_addr));
  server_addr.sin_family = AF_INET;
  server_addr.sin_port = htons(port);

  char* ip = NULL;
  struct hostent* hp = NULL;
  hp = gethostbyname(host.c_str());
  PADDLE_ENFORCE_NOT_NULL(
      hp, platform::errors::InvalidArgument("Fail to get host by name %s.",
                                            host));

  int i = 0;
  while (hp->h_addr_list[i] != NULL) {
    ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]);
    VLOG(3) << "gethostbyname host:" << host << "  ->ip: " << ip;
    break;
  }

  PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0,
                    platform::errors::Unavailable("Open address %s failed: %s",
                                                  ep, strerror(errno)));

  // TODO(wangxi) Set from env, default 900s=15min
  int timeout = 900 * 1000;
  int try_times = 0;
  int total_time = 0;
  while (true) {
    int ret_val = -1;
    RETRY_SYS_CALL_VAL(
        connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)),
        "connect", ret_val);

    if (ret_val == -1) {
      BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep);
      continue;
    }

    CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send");
    break;
  }
  return sock;
}

static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) {
  char buffer[1024] = {0};
  static_assert(HCCL_UNIQUE_ID_BYTES <= 1024,
                "hccl id bytes must <= buffer size");

  CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES),
                 "recv hccl id");
  memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES);
}

static void SendHCCLID(int conn, HcclRootInfo* hccl_id) {
  char buffer[1024] = {0};
  memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES);

  CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES),
                 "send hccl id");
}

void SendBroadCastHCCLID(std::vector<std::string> servers, int hccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope) {
  // connect with server
  std::vector<int> connects;
  for (auto server : servers) {
    VLOG(3) << "connecting endpoint: " << server;
    int conn = ConnectAddr(server, COMM_HEAD);
    connects.push_back(conn);
  }
  VLOG(3) << "connecting completed...";

  for (int i = 0; i < hccl_comm_num; ++i) {
    std::string var_name = func(i);
    auto var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        var, platform::errors::NotFound("Variable with name %s is not found",
                                        var_name.c_str()));
    auto hccl_id = var->GetMutable<HcclRootInfo>();
    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id));

    int j = 0;
    for (auto conn : connects) {
      VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j]
              << " hccl_comm_no: " << i;
      SendHCCLID(conn, hccl_id);
      ++j;
    }
    VLOG(3) << "sending completed...";
  }

  // close client
  for (auto conn : connects) {
    CloseSocket(conn);
  }
}

void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope) {
  int server = CreateListenSocket(endpoint);
  RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope);
  CloseSocket(server);
}

void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope) {
  int client = SocketAccept(server_fd, COMM_HEAD);

  for (int i = 0; i < hccl_comm_num; ++i) {
    std::string var_name = func(i);
    auto var = scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(
        var, platform::errors::NotFound("Variable with name %s is not found",
                                        var_name.c_str()));
    auto hccl_id = var->GetMutable<HcclRootInfo>();

    VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name
            << " from trainer 0, hccl_comm_no: " << i;
    RecvHCCLID(client, hccl_id);
  }
  VLOG(3) << "receiving completed...";
  CloseSocket(client);
}

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/collective/gen_hccl_id_op_helper.h
0 → 100644

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <functional>
#include <string>
#include <vector>

namespace paddle {
namespace framework {
class Scope;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace operators {

int CreateListenSocket(const std::string& ep);

void CloseSocket(int fd);

void SendBroadCastHCCLID(std::vector<std::string> servers, int nccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope);

// server listen on endpoint, then recv nccl id
void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope);

// recv nccl id from socket
void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
                         std::function<std::string(size_t)> func,
                         const framework::Scope& scope);

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/collective/recv_v2_op_npu.cc
...
@@ -27,32 +27,39 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
-    auto out = ctx.Output<framework::LoDTensor>("Out");
-    int numel = out->numel();
-    hcclDataType_t dtype = platform::ToHCCLDataType(out->type());
+    auto x = ctx.Output<framework::LoDTensor>("Out");
+    void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
+    int numel = x->numel();
+    HcclDataType dtype = platform::ToHCCLDataType(x->type());

     int ring_id = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
-    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    auto comm =
+        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

     aclrtStream stream = nullptr;
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
     std::string tag = std::to_string(ring_id) + "_" +
                       std::to_string(comm->NextTagId());
     std::string group =
         std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    int srcRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");
-    VLOG(3) << "recv_v2_npu attr get";
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive(
-        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())),
-        (u64)numel, dtype, srcRank, srTag, group.c_str(), stream));
-    VLOG(3) << "Source Rank: " << srcRank
-            << " Invoke hcom receive. receiving ";
-    out->Resize(out->dims());
-    out->set_lod(out->lod());
+    int nranks = comm->nranks();
+    int peer = ctx.Attr<int>("peer");
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
+    int root = peer;
+    VLOG(3) << "begin hccl recv, parameter is: "
+            << "root " << root << ", comm: " << comm->comm()
+            << ", stream: " << stream;
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
+        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
...
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
...
@@ -31,6 +31,8 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"

 #include "paddle/fluid/operators/collective/recv_v2_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
...
@@ -42,83 +44,126 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(recv_v2);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(recv_v2, NPU);

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::string rank_table_file = getenv("RANK_TABLE_FILE");
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));
   int src_rank = atoi(getenv("SRC_RANK"));
   int dest_rank = atoi(getenv("DEST_RANK"));
   VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
           << dest_rank;

-  std::vector<int> rank_ids = {0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
-  VLOG(3) << "break";
-  auto comm_init_op =
-      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
-  VLOG(3) << "CreateOp c_comm_init_hcom";
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {},
+                                              {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hccl",
+                                              {{"X", {"X"}}}, {},
+                                              comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
 }

-void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
-
-  int num = atoi(getenv("DATA_SIZE"));
-  EXPECT_GT(num, 0);
-  EXPECT_LT(num, 1 << 15);
-  int rank_id = atoi(getenv("RANK_ID"));
-  VLOG(3) << "rank_id:" << rank_id << std::endl;
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  ctx.Wait();
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num, num});
-  tensor_out->mutable_data<float>(place);  // allocate
-
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("srtest");
-  attrs["peer"] = atoi(getenv("SRC_RANK"));
-  attrs["ring_id"] = 0;
-  attrs["srTag"] = 0;
-  std::vector<int> out_shape;
-  out_shape.push_back(num);
-  out_shape.push_back(num);
-  attrs["out_shape"] = out_shape;
-
-  auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
-  VLOG(3) << "CreateOp recv_v2";
-
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  VLOG(3) << "Run op recv_v2";
-  std::vector<float> out_vec;
-  TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
-  EXPECT_EQ(out_vec == init, true);
-}
+void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
+
+  int num = atoi(getenv("DATA_SIZE"));
+  EXPECT_GT(num, 0);
+  EXPECT_LT(num, 1 << 15);
+  int rank_id = atoi(getenv("RANK_ID"));
+  VLOG(3) << "rank_id:" << rank_id << std::endl;
+
+  ctx.Wait();
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Data");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num, num});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("srtest");
+  attrs["peer"] = atoi(getenv("SRC_RANK"));
+  attrs["ring_id"] = 0;
+  attrs["srTag"] = 0;
+  std::vector<int> out_shape;
+  out_shape.push_back(num);
+  out_shape.push_back(num);
+  attrs["out_shape"] = out_shape;
+
+  auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs);
+  VLOG(3) << "CreateOp recv_v2";
+
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
+  VLOG(3) << "Run op recv_v2";
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
+  EXPECT_EQ(out_vec == init, true);
+}

-TEST(recv_v2, NPU) {
-  f::Scope scope;
-  char* npu_id = getenv("FLAGS_selected_npus");
-  VLOG(3) << "Select npu:" << npu_id;
-  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
-  VLOG(3) << "Place over";
-  Prepare(&scope, *ctx);
-  VLOG(3) << "Prepare over";
-  TestHcomRecvOp(&scope, *ctx);
-  VLOG(3) << "Test over";
-}
+TEST(recv_v2, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+  char* npu_id = getenv("FLAGS_selected_npus");
+  VLOG(3) << "Select npu:" << npu_id;
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHcomRecvOp(&scope, ctx);
+}
paddle/fluid/operators/collective/send_v2_op_npu.cc
...
@@ -28,31 +28,37 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
     auto x = ctx.Input<framework::LoDTensor>("X");
+    void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
     int numel = x->numel();
-    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+    HcclDataType dtype = platform::ToHCCLDataType(x->type());

-    auto place = ctx.GetPlace();
     int ring_id = ctx.Attr<int>("ring_id");
-    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    auto place = ctx.GetPlace();
+    auto comm =
+        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

     aclrtStream stream = nullptr;
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
     std::string tag = std::to_string(ring_id) + "_" +
                       std::to_string(comm->NextTagId());
     std::string group =
         std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
     int destRank = ctx.Attr<int>("peer");
     int srTag = ctx.Attr<int>("srTag");

-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send(
-        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())),
-        (u64)numel, dtype, destRank, srTag, group.c_str(), stream));
+    int nranks = comm->nranks();
+    int rank = comm->rank();
+    PADDLE_ENFORCE_EQ(nranks, 2,
+                      platform::errors::InvalidArgument(
+                          "The nranks must be 2, but (%d)", nranks));
+    int root = rank;
+    VLOG(3) << "begin hccl send, parameter is: "
+            << "root " << root << ", comm: " << comm->comm()
+            << ", stream: " << stream;
     VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent "
             << x->numel();
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
+        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
...
paddle/fluid/operators/collective/send_v2_op_npu_test.cc
浏览文件 @
ce26f882
...
...
@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
...
...
@@ -41,72 +42,115 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;

 USE_OP(send_v2);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(send_v2, NPU);

-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::string rank_table_file = getenv("RANK_TABLE_FILE");
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));
   int src_rank = atoi(getenv("SRC_RANK"));
   int dest_rank = atoi(getenv("DEST_RANK"));
   VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
           << dest_rank;

-  std::vector<int> rank_ids = {0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["nranks"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op =
-      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
   auto place = ctx.GetPlace();
   comm_init_op->Run(*scope, place);
   ctx.Wait();
+  memcpy(hccl_id, id, 1024);
 }

-void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-  int num = atoi(getenv("DATA_SIZE"));
-  EXPECT_GT(num, 0);
-  EXPECT_LT(num, 1 << 15);
-  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
-  int rank_id = atoi(getenv("RANK_ID"));
-  VLOG(3) << "rank id:" << rank_id;
-  TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num, num});
-  ctx.Wait();
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}

-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("srtest");
-  attrs["peer"] = atoi(getenv("DEST_RANK"));
-  attrs["ring_id"] = 0;
-  attrs["srTag"] = 0;
-  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  VLOG(3) << "send run over";
-  ctx.Wait();
-}
+void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  int num = atoi(getenv("DATA_SIZE"));
+
+  EXPECT_GT(num, 0);
+  EXPECT_LT(num, 1 << 15);
+  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
+  int rank_id = atoi(getenv("RANK_ID"));
+  VLOG(3) << "rank id:" << rank_id;
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num, num});
+  ctx.Wait();
+  auto place = ctx.GetPlace();
+  ctx.Wait();
+
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("srtest");
+  attrs["peer"] = atoi(getenv("DEST_RANK"));
+  attrs["ring_id"] = 0;
+  attrs["srTag"] = 0;
+
+  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs);
+
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
+  VLOG(3) << "send run over";
+  ctx.Wait();
+}

 TEST(send_v2, NPU) {
   f::Scope scope;
+  HcclRootInfo hccl_id;

   char* npu_id = getenv("FLAGS_selected_npus");
   VLOG(3) << "Select npu:" << npu_id;
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
-  VLOG(3) << "Place over";
-  Prepare(&scope, ctx);
-  VLOG(3) << "Prepare over";
-  TestHcomSendOp(&scope, ctx);
-  VLOG(3) << "Test over";
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
+  VLOG(3) << "Place over";
+
+  PrepareUniqueId(&scope, *ctx, &hccl_id);
+  Prepare(&scope, *ctx, &hccl_id);
+  TestHcomSendOp(&scope, *ctx);
 }
paddle/fluid/platform/collective_helper.h
...
...
@@ -157,15 +157,10 @@ class HCCLComm {
   virtual int nranks() const = 0;
   virtual int rank() const = 0;
   virtual int device_id() const = 0;
+  virtual HcclComm comm() const = 0;
   virtual aclrtStream stream() const = 0;
   virtual NPUDeviceContext* dev_context() const = 0;
   virtual ~HCCLComm() = default;
-
-  unsigned long NextTagId() { return tag_counter_++; }
-
- private:
-  std::atomic<unsigned long> tag_counter_;
 };
// A singleton HCCL communicator context reserves communication ring ids
...
...
@@ -176,11 +171,12 @@ class HCCLCommContext {
     return comm_ctx;
   }

-  HCCLComm* CreateHCCLComm(const std::vector<int>& world_rank_ids, int rank,
-                           int dev_id, int ring_id = 0);
+  HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, int rank,
+                           int dev_id, int ring_id);
   // a latter comm with the same dev_id and the same ring_id
   // will override the former
-  HCCLComm* AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id = 0);
+  HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, int dev_id,
+                           int ring_id);

   // retrieve a communicator by the ring id in multiprocessing mode
   HCCLComm* Get(int ring_id) const {
...
...
@@ -217,20 +213,21 @@ class HCCLCommContext {
 private:
   // Init global hcom
-  HCCLCommContext() { InitHcomWorldGroup(); }
+  HCCLCommContext() {}
+  // we may use the group feature in the future
+  // HCCLCommContext() { InitHcomWorldGroup(); }
+
+  HcclComm comm_;

 public:
-  ~HCCLCommContext() {
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy());
-  }
+  ~HCCLCommContext() {}

   std::once_flag once_flag_;
   std::mutex comm_map_mutex_;
   // ring id to dev-HCCLComm
   std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;

-  void InitHcomWorldGroup();
+  // void InitHcomWorldGroup();
   void ReleaseHCCLComms();

   DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
...
...
paddle/fluid/platform/collective_helper_npu.cc
...
...
@@ -34,6 +34,13 @@ class HCCLCommImpl : public HCCLComm {
     return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device;
   }

+  ~HCCLCommImpl() {
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_));
+  }
+
+  void set_comm(HcclComm comm) { comm_ = comm; }
+  HcclComm comm() const override { return comm_; }
+
   aclrtStream stream() const override { return dev_ctx_->stream(); }

   void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
...
...
@@ -45,46 +52,43 @@ class HCCLCommImpl : public HCCLComm {
   int ring_id_;
   int nranks_;
   int rank_;
+  HcclComm comm_;
   std::unique_ptr<NPUDeviceContext> dev_ctx_;
 };

-HCCLComm* HCCLCommContext::CreateHCCLComm(
-    const std::vector<int>& world_rank_ids, int rank, int dev_id,
-    int ring_id) {
-  PADDLE_ENFORCE_GT(
-      world_rank_ids.size(), 1,
-      platform::errors::InvalidArgument(
-          "Expected world_rank_ids.size() > 1. But received size is %d.",
-          world_rank_ids.size()));
+HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks,
+                                          int rank, int dev_id, int ring_id) {
+  PADDLE_ENFORCE_NOT_NULL(hccl_id,
+                          platform::errors::InvalidArgument(
+                              "The hccl unique id should not be null."));
+  PADDLE_ENFORCE_GT(
+      nranks, 1,
+      platform::errors::InvalidArgument(
+          "Expected nranks > 1. But received nranks is %d.", nranks));
   PADDLE_ENFORCE_GE(rank, 0,
                     platform::errors::InvalidArgument(
                         "Expected rank >= 0. But received rank is %d.", rank));
   PADDLE_ENFORCE_LT(
-      rank, world_rank_ids.size(),
+      rank, nranks,
       platform::errors::InvalidArgument(
           "Expected rank < nranks. But received rank is %d, nranks is %d.",
-          rank, world_rank_ids.size()));
+          rank, nranks));
   PADDLE_ENFORCE_GE(
       dev_id, 0,
       platform::errors::InvalidArgument(
           "Expected dev_id >= 0. But received dev_id is %d.", dev_id));
   PADDLE_ENFORCE_GE(
       ring_id, 0,
       platform::errors::InvalidArgument(
           "Expected ring_id >= 0. But received ring_id is %d.", ring_id));

-  auto* comm_wrapper =
-      AssignHCCLComm(world_rank_ids.size(), rank, dev_id, ring_id);
-
-  // HACK(sunpeng17): hcom API requires bind stream to a model
-  // but we don't need model in Paddle, so we feed stream pointer as model
-  // pointer
-  PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_bind_model(
-      comm_wrapper->stream(), comm_wrapper->stream()));
-
-  // Get world_rank_ids registered in gen_nccl_id op
-  std::string group_name = HCOM_GROUP_PREFIX + std::to_string(ring_id);
-  PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_create_group(
-      group_name.c_str(), world_rank_ids.size(),
-      (unsigned int*)world_rank_ids.data()));
+  HcclComm comm;
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id));
+  PADDLE_ENFORCE_NPU_SUCCESS(
+      platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm));
+
+  VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks
+          << ", hccl_id: " << hccl_id << ", rank: " << rank;
+
+  auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id);

   VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id
-          << " has been created on device " << dev_id
-          << ", group name: " << group_name;
+          << " has been created on device " << dev_id
+          << ", with comm: " << comm_wrapper->comm();

   std::call_once(once_flag_, []() {
     std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); });
...
...
@@ -93,7 +97,8 @@ HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector<int>& world_rank_ids
   return comm_wrapper;
 }

-HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id,
-                                          int ring_id) {
+HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank,
+                                          int dev_id, int ring_id) {
   std::unique_ptr<NPUDeviceContext> dev_ctx(
       new NPUDeviceContext(NPUPlace(dev_id)));
...
...
@@ -101,6 +106,7 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int
   c->set_ring_id(ring_id);
   c->set_nranks(nranks);
   c->set_rank(rank);
+  c->set_comm(comm);
   c->set_dev_ctx(std::move(dev_ctx));

   comm_map_mutex_.lock();
...
...
@@ -112,23 +118,14 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int
   dev2comm.emplace(dev_id, std::unique_ptr<HCCLComm>(c));
   comm_map_mutex_.unlock();

-  return comm_map_[ring_id][dev_id].get();
-}
-
-void HCCLCommContext::InitHcomWorldGroup() {
-  const char* rank_table_file = getenv(ENV_RANK_TABLE_FILE);
-  PADDLE_ENFORCE_NOT_NULL(
-      rank_table_file,
-      platform::errors::InvalidArgument(
-          "The RANK_TABLE_FILE environment variable should not be null."));
-
-  const char* rank_id = getenv(ENV_RANK_ID);
-  PADDLE_ENFORCE_NOT_NULL(
-      rank_id, platform::errors::InvalidArgument(
-                   "The RANK_ID environment variable should not be null."));
-
-  PADDLE_ENFORCE_NPU_SUCCESS(
-      platform::dynload::hcom_init(rank_table_file, rank_id));
-  VLOG(3) << "Successfully initialized hcom. rank_table_file: "
-          << rank_table_file << ", rank_id " << rank_id;
-}
+  if (ring_id == 0) {
+    auto* dev_ctx = static_cast<platform::NPUDeviceContext*>(
+        platform::DeviceContextPool::Instance().Get(
+            platform::NPUPlace(dev_id)));
+    dev_ctx->set_hccl_comm(comm);
+  }
+
+  return comm_map_[ring_id][dev_id].get();
+}

 void HCCLCommContext::ReleaseHCCLComms() {
...
...
paddle/fluid/platform/device_context.h
...
...
@@ -185,11 +185,21 @@ class NPUDeviceContext : public DeviceContext {
   void WaitStreamCallback() const { return stream_->WaitCallback(); }

+#if defined(PADDLE_WITH_ASCEND_CL)
+  /*! \brief  Return hccl communicators. */
+  HcclComm hccl_comm() const { return hccl_comm_; }
+
+  /*! \brief  Set hccl communicators. */
+  void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
+#endif
+
 private:
   NPUPlace place_;
   aclrtContext context_;

-#ifdef PADDLE_WITH_ASCEND_HCCL
-  HCCLContext_t hccl_context_;
+#ifdef PADDLE_WITH_ASCEND_CL
+  // HCCLContext_t hccl_context_;
+  HcclComm hccl_comm_{nullptr};
 #endif
// Need to be the same with other DeviceContext,
...
...
paddle/fluid/platform/dynload/hccl.h
...
...
@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once

-// #include <hccl/hccl.h>
-// #include <hccl/hccl_types.h>
+#include <hccl/hccl.h>
+#include <hccl/hccl_types.h>
 #include <mutex>  // NOLINT

 #include "paddle/fluid/platform/port.h"
-#include "paddle/fluid/platform/dynload/hcom.h"
+// #include "paddle/fluid/platform/dynload/hcom.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"

+#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
namespace paddle {
namespace platform {
namespace dynload {
...
...
@@ -43,27 +45,14 @@ extern void* hccl_dso_handle;
extern DynLoad__##__name __name
 #define HCCL_RAND_ROUTINE_EACH(__macro) \
-  __macro(hcom_init);                   \
-  __macro(hcom_destroy);                \
-  __macro(hcom_bind_model);             \
-  __macro(hcom_unbind_model);           \
-  __macro(hcom_send);                   \
-  __macro(hcom_receive);                \
-  __macro(hcom_broadcast);              \
-  __macro(hcom_all_gather);             \
-  __macro(hcom_all_reduce);             \
-  __macro(hcom_reduce_scatter);         \
-  __macro(hcom_create_group);           \
-  __macro(hcom_destroy_group);          \
-  __macro(hcom_get_rank_id);            \
-  __macro(hcom_get_local_rank_id);      \
-  __macro(hcom_get_local_rank_size);    \
-  __macro(hcom_get_split_strategy);     \
-  __macro(hcom_set_split_strategy_by_size);  \
-  __macro(hcom_set_split_strategy_by_index); \
-  __macro(hcom_get_group_rank_from_world_rank); \
-  __macro(hcom_get_world_rank_from_group_rank);
+  __macro(HcclReduceScatter);           \
+  __macro(HcclCommDestroy);             \
+  __macro(HcclAllReduce);               \
+  __macro(HcclCommInitRootInfo);        \
+  __macro(HcclGetRootInfo);             \
+  __macro(HcclBroadcast);               \
+  __macro(HcclCommInitClusterInfo);     \
+  __macro(HcclAllGather);

 HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
...
...
paddle/fluid/platform/enforce.h
...
...
@@ -40,7 +40,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "acl/acl.h"
-#include "paddle/fluid/platform/dynload/hcom.h"
+#include "hccl/hccl_types.h"
 #endif  // PADDLE_WITH_ASCEND_CL
#include <fstream>
...
...
@@ -1013,7 +1013,7 @@ struct NPUStatusType {};
 }

 DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
-DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS);
+DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS);
 }  // namespace details
inline std::string build_npu_error_msg(aclError stat) {
...
...
@@ -1022,7 +1022,7 @@ inline std::string build_npu_error_msg(aclError stat) {
   return sout.str();
 }

-inline std::string build_npu_error_msg(hcclResult_t stat) {
+inline std::string build_npu_error_msg(HcclResult stat) {
   std::ostringstream sout;
   sout << " HCCL error, the error code is : " << stat << ". ";
   return sout.str();
...
...
paddle/fluid/platform/hccl_helper.h
...
...
@@ -14,7 +14,7 @@
 #pragma once

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_ASCEND_CL)
#include <stdio.h>
#include <memory>
...
...
@@ -24,30 +24,22 @@
 #include <unordered_map>
 #include <vector>

-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/platform/collective_helper.h"
 #ifdef PADDLE_WITH_NCCL
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #ifdef PADDLE_WITH_RCCL
 #include "paddle/fluid/platform/dynload/rccl.h"
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/dynload/hccl.h"
 #endif
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"

-#define NCCL_ID_VARNAME "NCCLID"
+#define HCCL_ID_VARNAME "HCCLID"
namespace paddle {
namespace platform {
-inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) {
+inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) {
   if (type == framework::proto::VarType::FP32) {
     return HCCL_DATA_TYPE_FP32;
   } else if (type == framework::proto::VarType::FP16) {
...
@@ -56,7 +48,7 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) {
     return HCCL_DATA_TYPE_INT32;
   } else if (type == framework::proto::VarType::INT8) {
     return HCCL_DATA_TYPE_INT8;
   }
   // else if (type == framework::proto::VarType::FP64) {
   //   return HCCL_DATA_TYPE_FP32;
   // }
...
...
@@ -66,298 +58,301 @@ inline hcclDataType_t ToHCCLDataType(framework::proto::VarType::Type type) {
   }
 }

// NOTE(minqiyang): according to the ncclGroupEnd documentations:
// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
// ncclGroupEnd will wait for all communicators to be initialized, which will
// cause blocking problem when a runtime_error was thrown, so try only guard
// HCCL actions when use it.
// class HCCLGroupGuard {
//  public:
//   static std::mutex &HCCLMutex() {
//     static std::mutex mtx;
//     return mtx;
//   }
//   inline HCCLGroupGuard() {
//     HCCLMutex().lock();
//     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart());
//   }
//   inline ~HCCLGroupGuard() PADDLE_MAY_THROW {
//     PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd());
//     HCCLMutex().unlock();
//   }
// };

struct HCCLContext {
  std::unique_ptr<NPUDeviceContext> ctx_;
  HcclComm comm_;

  explicit HCCLContext(int dev_id)
      : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {}

  aclrtStream stream() const { return ctx_->stream(); }
  HcclComm comm() const { return comm_; }

  int device_id() const {
    return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device;
  }
};

struct HCCLContextMap {
  std::unordered_map<int, HCCLContext> contexts_;
  std::vector<int> order_;

  explicit HCCLContextMap(const std::vector<platform::Place> &places,
                          HcclRootInfo *hccl_id = nullptr,
                          size_t num_trainers = 1, size_t trainer_id = 0) {
    PADDLE_ENFORCE_EQ(!places.empty(), true,
                      platform::errors::InvalidArgument(
                          "The HCCL place should not be empty."));
    order_.reserve(places.size());
    for (auto &p : places) {
      int dev_id = BOOST_GET_CONST(NPUPlace, p).device;
      order_.emplace_back(dev_id);
      contexts_.emplace(dev_id, HCCLContext(dev_id));
    }
    PADDLE_ENFORCE_EQ(
        order_.size(), contexts_.size(),
        platform::errors::Unavailable("HCCL Context Map does not support "
                                      "contain two or more same device."));

    std::unique_ptr<HcclComm[]> comms(new HcclComm[order_.size()]);
    // if num_trainers == 1, should create a new nccl id for local comms.
    if (num_trainers == 1 && hccl_id == nullptr) {
      // we do not know how to tackle this situation under hccl
      // std::lock_guard<std::mutex> guard(HCCLGroupGuard::HCCLMutex());
      // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll(
      //     comms.get(), static_cast<int>(order_.size()), order_.data()));
    } else {
      PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument(
                                           "The HCCL id should not be null."));
      {
        int nranks = num_trainers * order_.size();
        // HCCLGroupGuard gurad;
        for (size_t i = 0; i < order_.size(); ++i) {
          int gpu_id = order_[i];
          int rank;
          if (order_.size() > 1) {
            rank = trainer_id * order_.size() + i;
          } else {
            rank = trainer_id;
          }
          VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks
                  << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i];
          aclrtSetDevice(gpu_id);
          PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo(
              nranks, hccl_id, rank, comms.get() + i));
        }
      }
    }
    int i = 0;
    for (auto &dev_id : order_) {
      contexts_.at(dev_id).comm_ = comms[i++];
    }
  }
  HCCLContextMap(const HCCLContextMap &other) = delete;
  HCCLContextMap &operator=(const HCCLContextMap &other) = delete;

  NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }

  NPUDeviceContext *DevCtx(platform::Place p) const {
    return DevCtx(BOOST_GET_CONST(NPUPlace, p).device);
  }

  const HCCLContext &at(platform::Place p) const {
    return this->at(BOOST_GET_CONST(NPUPlace, p).device);
  }

  const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); }

  void WaitAll() {
    for (auto &p : contexts_) {
      p.second.ctx_->Wait();
    }
  }
};
inline std::string GetFlatHCCLVarName(size_t pos) {
  if (pos == 0) {
    return HCCL_ID_VARNAME;
  }
  return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast<int>(pos));
}

inline std::string GetHierarchicalExterHCCLVarName(size_t pos) {
  return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME,
                         static_cast<int>(pos));
}

inline std::string GetHierarchicalInterHCCLVarName(size_t pos) {
  return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME,
                         static_cast<int>(pos));
}
class HCCLCommunicator {
 public:
  HCCLCommunicator() {}
  virtual ~HCCLCommunicator() PADDLE_MAY_THROW {}

  HCCLContextMap *DefaultFlatCtx() const {
    if (flat_ctxs_.size() == 0) {
      return nullptr;
    }
    return flat_ctxs_[0].get();
  }

  std::vector<std::unique_ptr<HCCLContextMap>> *GetFlatCtxs() {
    return &flat_ctxs_;
  }

  HCCLContextMap *GetFlatCtx(size_t run_order) const {
    return flat_ctxs_[run_order % flat_ctxs_.size()].get();
  }

  HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order,
                                   bool use_hierarchical_allreduce) const {
    if (!use_hierarchical_allreduce) {
      return GetFlatCtx(run_order);
    }
    return GetHierarchicalInterCtx(run_order);
  }

  /*
   When nccl inits nccl comm using ncclCommInitAll, it meets error when
   allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So
   create a new nccl comm for sync_batch_norm_op. And these codes should be
   polished with a unified nccl management.
  */
  // bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }
  HCCLContextMap *GetSyncBatchNormCtx(
      framework::Scope *scope, const std::vector<platform::Place> &places) {
    auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME);
    if (hccl_id_var != nullptr) {
      return DefaultFlatCtx();
    }
    if (sync_batch_norm_ctx_.get() == nullptr) {
      sync_batch_norm_ctx_.reset(new HCCLContextMap(places));
    }
    return sync_batch_norm_ctx_.get();
  }
// NCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
// PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0,
// platform::errors::InvalidArgument(
// "Hierarchical ctxs should be initialized firstly!"));
// return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
// }
  void InitFlatCtxs(const std::vector<platform::Place> &places,
                    const std::vector<HcclRootInfo *> &hccl_ids,
                    size_t trainers_num, size_t trainer_id) {
    if (hccl_ids.size() == 0) {
      auto ptr = new platform::HCCLContextMap(places);
      VLOG(1) << "init local trainer";
      flat_ctxs_.emplace_back(ptr);
    } else {
      for (size_t i = 0; i < hccl_ids.size(); i++) {
        auto ptr = new platform::HCCLContextMap(places, hccl_ids[i],
                                                trainers_num, trainer_id);
        VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
        flat_ctxs_.emplace_back(ptr);
      }
    }

    // as Executor have no way to use ncclComm created by ParallelExecutor,
    // we assign all flatten contexts to HCCLCommContext to fix.
    int nranks = static_cast<int>(trainers_num * places.size());
    int nrings = static_cast<int>(flat_ctxs_.size());
    for (int ring_id = 0; ring_id < nrings; ++ring_id) {
      for (size_t p = 0; p < places.size(); ++p) {
        int rank = trainer_id * places.size() + p;
        int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device;
        auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id);
        HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank,
                                                   dev_id, ring_id);
      }
    }
  }

  // std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalInterCtxs() {
  //   return &h_inter_ctxs_;
  // }
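The rank bookkeeping inside `InitFlatCtxs` can be sketched in Python: for every communication ring, each local device gets a globally unique rank of `trainer_id * ndev + p`, and all `trainers_num * ndev` processes share the same `nranks`. The function name and return shape below are illustrative, not part of the Paddle API.

```python
def assign_flat_ranks(trainers_num, trainer_id, ndev, nrings):
    """Mirror of InitFlatCtxs' rank arithmetic: for each ring, local
    device p on this trainer gets global rank trainer_id * ndev + p."""
    nranks = trainers_num * ndev
    assignments = []
    for ring_id in range(nrings):
        for p in range(ndev):
            rank = trainer_id * ndev + p  # global rank of local device p
            assignments.append((ring_id, p, rank, nranks))
    return assignments

# trainer 1 of 2, with 4 NPUs each and one ring:
print(assign_flat_ranks(2, 1, 4, 1))
```

With two trainers of four NPUs each, trainer 1's devices get ranks 4..7 out of 8, one tuple per (ring, device) pair.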
  void InitHierarchicalCtxs(const std::vector<platform::Place> &places,
                            const std::vector<HcclRootInfo *> &inter_hccl_ids,
                            const std::vector<HcclRootInfo *> &exter_hccl_ids,
                            size_t trainers_num, size_t trainer_id,
                            size_t inter_trainers_num,
                            size_t exter_trainers_num) {
    PADDLE_ENFORCE_EQ(
        trainers_num, inter_trainers_num * exter_trainers_num,
        platform::errors::InvalidArgument(
            "trainers_num:%llu != inter_trainers_num:%llu * "
            "exter_trainers_num:%llu",
            trainers_num, inter_trainers_num, exter_trainers_num));

    PADDLE_ENFORCE_GT(
        inter_trainers_num, 1,
        platform::errors::InvalidArgument(
            "The inter_trainers_num:%llu should be larger than 1.",
            inter_trainers_num));

    int inter_trainer_id = trainer_id % inter_trainers_num;
    for (size_t i = 0; i < inter_hccl_ids.size(); i++) {
      VLOG(1) << "init inter_trainer_id:" << inter_trainer_id
              << ", comm no:" << i;
      auto local = new HCCLContextMap(places, inter_hccl_ids[i],
                                      inter_trainers_num, inter_trainer_id);
      h_inter_ctxs_.emplace_back(local);
    }

    int exter_trainer_id = -1;
    if (trainer_id % inter_trainers_num == 0) {
      exter_trainer_id = trainer_id / inter_trainers_num;
    }
    if (exter_trainer_id >= 0) {
      for (size_t i = 0; i < exter_hccl_ids.size(); i++) {
        auto ex = new HCCLContextMap(places, exter_hccl_ids[i],
                                     exter_trainers_num, exter_trainer_id);
        VLOG(1) << "init exter_trainer_id:" << exter_trainer_id
                << ", comm no:" << i;
        h_exter_ctxs_.emplace_back(ex);
      }
    }
  }

  // std::vector<std::unique_ptr<NCCLContextMap>> *GetHierarchicalExterCtxs() {
  //   return &h_exter_ctxs_;
  // }
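The 2D decomposition that `InitHierarchicalCtxs` performs on `trainer_id` can be sketched as follows: every trainer joins an inner ("inter") group, and only the first trainer of each inner group (an assumption made explicit by the `% == 0` test above) joins the outer ("exter") group. The helper name is illustrative only.

```python
def hierarchical_ids(trainer_id, inter_trainers_num):
    """Mirror of InitHierarchicalCtxs' rank decomposition for 2D allreduce:
    all trainers get an inner-group id; one leader per inner group also
    gets an outer-group id, all others get -1."""
    inter_trainer_id = trainer_id % inter_trainers_num
    exter_trainer_id = -1
    if trainer_id % inter_trainers_num == 0:
        exter_trainer_id = trainer_id // inter_trainers_num
    return inter_trainer_id, exter_trainer_id

# 8 trainers in inner groups of 4: trainers 0 and 4 are outer-group leaders.
print(hierarchical_ids(4, 4))
```

Trainer 5 lands at position 1 of its inner group and takes no part in the outer allreduce, while trainer 4 leads the second inner group as outer rank 1.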
  bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; }

  // protected:
  //  // Support multi nccl comm on default nccl ring while NCCLContextMap can't.
  //  std::vector<std::unique_ptr<NCCLContextMap>> flat_ctxs_;
  HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const {
    PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0,
                      platform::errors::InvalidArgument(
                          "Hierarchical ctxs should be initialized firstly!"));
    return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get();
  }

  // // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
  // // And h_exter_ctxs_ can support multi comm too.
  // std::vector<std::unique_ptr<NCCLContextMap>> h_inter_ctxs_;
  // std::vector<std::unique_ptr<NCCLContextMap>> h_exter_ctxs_;
  HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const {
    PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0,
                      platform::errors::InvalidArgument(
                          "Hierarchical ctxs should be initialized firstly!"));
    return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get();
  }

  // // just used for sync_batch_norm op.
  // std::unique_ptr<NCCLContextMap> sync_batch_norm_ctx_;
  // };
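Both `GetHierarchicalInterCtx` and `GetHierarchicalExterCtx` select a communicator round-robin by `run_order % size`, so multiple comms can be rotated across successive collectives. A minimal sketch (illustrative names; plain strings stand in for context maps):

```python
def pick_ctx(ctxs, run_order):
    """Round-robin selection mirroring GetHierarchicalInterCtx /
    GetHierarchicalExterCtx: rotate the available communicators
    across run orders; fail fast if none were initialized."""
    if not ctxs:
        raise ValueError("Hierarchical ctxs should be initialized firstly!")
    return ctxs[run_order % len(ctxs)]

print(pick_ctx(["comm0", "comm1"], 5))
```

The modulo keeps any `run_order` valid regardless of how many comms were created, which is why the only failure mode is an empty context list.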
  std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalInterCtxs() {
    return &h_inter_ctxs_;
  }

  std::vector<std::unique_ptr<HCCLContextMap>> *GetHierarchicalExterCtxs() {
    return &h_exter_ctxs_;
  }
 protected:
  // Support multi nccl comm on default nccl ring while HCCLContextMap can't.
  std::vector<std::unique_ptr<HCCLContextMap>> flat_ctxs_;

  // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce.
  // And h_exter_ctxs_ can support multi comm too.
  std::vector<std::unique_ptr<HCCLContextMap>> h_inter_ctxs_;
  std::vector<std::unique_ptr<HCCLContextMap>> h_exter_ctxs_;

  // just used for sync_batch_norm op.
  std::unique_ptr<HCCLContextMap> sync_batch_norm_ctx_;
};

}  // namespace platform
}  // namespace paddle
python/paddle/distributed/fleet/ascend_utils.py
@@ -74,11 +74,11 @@ def _get_ascend_rankfile(rank_table_file_path):
            node_ips.append(server['server_id'])
            device_list = server['device']
            device_count = len(device_list)
    return node_ips, device_count


def get_cloud_cluster(rank_table_file=None,
                      device_mode=DeviceMode.ASCEND_NPU,
                      start_port=6070):
    """
    Args:
@@ -86,7 +86,7 @@ def get_cloud_cluster(rank_table_file=None,
        device_mode: DeviceMode(Int)
        start_port: the start port of current runtime env
    """
    if rank_table_file:
        # multi trainers
        node_ips, device_count = _get_ascend_rankfile(rank_table_file)
        if len(node_ips) == 1:
@@ -106,7 +106,7 @@ def get_cloud_cluster(rank_table_file=None,
        node_ips = ["127.0.0.1"]
        node_ip = node_ips[0]
        device_count = 1
    devices_per_proc = [str(x) for x in range(device_count)]
    free_ports = [
        x for x in range(start_port, start_port + len(devices_per_proc))
@@ -117,4 +117,4 @@ def get_cloud_cluster(rank_table_file=None,
        trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
    return get_cluster(node_ips, node_ip, trainer_endpoints, device_mode,
                       devices_per_proc)
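The endpoint construction in `get_cloud_cluster` can be sketched standalone: each node exposes one consecutive port per NPU process, starting at `start_port`. The helper below is a simplified stand-in (its name and signature are not Paddle API; it skips rank-table parsing and the single-node fallback):

```python
def build_trainer_endpoints(node_ips, device_count, start_port=6070):
    """Sketch of get_cloud_cluster's endpoint derivation: every node
    gets device_count consecutive ports, one per NPU process."""
    free_ports = range(start_port, start_port + device_count)
    return [["%s:%d" % (ip, port) for port in free_ports] for ip in node_ips]

# two nodes, two NPUs each -> two endpoints per node
print(build_trainer_endpoints(["10.0.0.1", "10.0.0.2"], 2))
```

Note that ports repeat across nodes by design: only the `ip:port` pair needs to be unique cluster-wide.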
python/paddle/distributed/fleet/meta_optimizers/common.py
@@ -151,17 +151,30 @@ class CollectiveHelper(object):
                    OP_ROLE_KEY: OpRole.Forward
                })
        elif core.is_compiled_with_npu():
            hccl_id_var = block.create_var(
                name=unique_name.generate('hccl_id'),
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
            block.append_op(
                type='c_gen_hccl_id',
                inputs={},
                outputs={'Out': hccl_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init_hccl',
                inputs={'X': hccl_id_var},
                outputs={},
                attrs={
                    'nranks': nranks,
                    'rank': rank,
                    'ring_id': ring_id,
                    'device_id': int(os.getenv("FLAGS_selected_npus")),
                    'rank_ids': nranks,
                    OP_ROLE_KEY: OpRole.Forward
                })
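The NPU branch above replaces the old single `c_comm_init_hcom` op with a two-phase sequence: `c_gen_hccl_id` produces a shared HCCL root id, and `c_comm_init_hccl` consumes it to build the communicator. A minimal sketch of that ordering, with plain dicts standing in for `block.append_op` calls (the helper name is illustrative):

```python
def npu_init_ops(rank, nranks, ring_id, current_endpoint, other_endpoints,
                 device_id):
    """Sketch of the two-op NPU communicator init sequence:
    first generate the shared hccl id, then init the comm with it."""
    gen_id = {
        'type': 'c_gen_hccl_id',
        'attrs': {'rank': rank, 'endpoint': current_endpoint,
                  'other_endpoints': other_endpoints},
    }
    comm_init = {
        'type': 'c_comm_init_hccl',
        'attrs': {'nranks': nranks, 'rank': rank, 'ring_id': ring_id,
                  'device_id': device_id, 'rank_ids': nranks},
    }
    return [gen_id, comm_init]  # order matters: id must exist before init

ops = npu_init_ops(0, 2, 0, "10.0.0.1:6070", ["10.0.0.2:6070"], 0)
print([op['type'] for op in ops])
```

Splitting id generation from comm init mirrors the existing `c_gen_nccl_id` / `c_comm_init` pattern on GPU, so downstream passes can treat both backends uniformly.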
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
@@ -108,19 +108,32 @@ class PipelineHelper(object):
                    OP_ROLE_KEY: OpRole.Forward,
                })
        elif core.is_compiled_with_npu():
            if rank == 0 and wait_port:
                wait_server_ready(other_endpoints)
            hccl_id_var = block.create_var(
                name=unique_name.generate('hccl_id'),
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
            block.append_op(
                type='c_gen_hccl_id',
                inputs={},
                outputs={'Out': hccl_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    OP_ROLE_KEY: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init_hccl',
                inputs={'X': hccl_id_var},
                outputs={},
                attrs={
                    'nranks': nranks,
                    'rank': rank,
                    'ring_id': ring_id,
                    'device_id': int(os.getenv("FLAGS_selected_npus")),
                    'rank_ids': nranks,
                    OP_ROLE_KEY: OpRole.Forward
                })
python/paddle/fluid/framework.py
@@ -318,12 +318,12 @@ def _set_expected_place(place):
# TODO(zhiqiu): remove this function.
def _var_base_to_np(var_base):
    """
    convert VarBase tp numpy

    Args:
        var_base(VarBase) : the VarBase to convert
    Returns (np.ndarray): the np.ndarray contain the value of VarBase
    """
    warnings.warn(

@@ -413,7 +413,7 @@ def cuda_places(device_ids=None):
    If :code:`device_ids` is not None, it should be the device
    ids of GPUs. For example, if :code:`device_ids=[0,1,2]`,
    the returned list would be
    [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].

    Parameters:

@@ -455,19 +455,19 @@ def xpu_places(device_ids=None):
    xpu places would be returned.
    If :code:`device_ids` is not None, it should be the device
    ids of XPUs. For example, if :code:`device_ids=[0,1,2]`,
    the returned list would be
    [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].

    Parameters:
        device_ids (list or tuple of int, optional): list of XPU device ids.
    Returns:
        list of paddle.XPUPlace: Created XPU place list.
    Examples:
        .. code-block:: python

            import paddle
            import paddle.static as static

            paddle.enable_static()
            xpu_places = static.xpu_places()
    """

@@ -485,7 +485,7 @@ def cpu_places(device_count=None):
    This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
    If :code:`device_count` is None, the device count would
    be determined by environment variable :code:`CPU_NUM`.
    If :code:`CPU_NUM` is not set, the default value is 1,
    i.e. CPU_NUM=1.
    :code:`CPU_NUM` indicates the number of devices used in the current task.

@@ -518,7 +518,7 @@ def cuda_pinned_places(device_count=None):
    This function creates a list of :code:`fluid.CUDAPinnedPlace` objects.
    If :code:`device_count` is None, the device count would
    be determined by environment variable :code:`CPU_NUM`.
    If :code:`CPU_NUM` is not set, the default value is 1,
    i.e. CPU_NUM=1.
    :code:`CPU_NUM` indicates the number of devices used in the current task.

@@ -579,7 +579,7 @@ def name_scope(prefix=None):
    Generate hierarchical name prefix for the operators in Static Graph.

    Note:
        This should only used for debugging and visualization purpose.
        Don't use it for serious analysis such as graph/program transformations.
        Don't use it in dygraph, since it will cause memory leak.

@@ -604,7 +604,7 @@ def name_scope(prefix=None):
                    with paddle.static.name_scope("s4"):
                        g = f - 1

            # Op are created in the default main program.
            for op in paddle.static.default_main_program().block(0).ops:
                # elementwise_add is created in /s1/
                if op.type == 'elementwise_add':

@@ -1634,7 +1634,7 @@ class Variable(object):
    def clone(self):
        """
        Returns a new static Variable, which is the clone of the original static
        Variable. It remains in the current graph, that is, the cloned Variable
        provides gradient propagation. Calling ``out = tensor.clone()`` is same
        as ``out = assign(tensor)`` .

@@ -1685,7 +1685,7 @@ class Variable(object):
            key(str): Key for this information.
            value(object): The value associated to the key.

        Returns:
            None
        """
        if not hasattr(self, "_info"):

@@ -1699,7 +1699,7 @@ class Variable(object):
        Args:
            key(str): Key for this information.

        Returns:
            object
        """
        if hasattr(self, "_info") and key in self._info:

@@ -2053,7 +2053,7 @@ class Operator(object):
        'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
        'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
        'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
        'gen_nccl_id', 'c_gen_nccl_id', 'c_gen_hccl_id', 'c_comm_init',
        'c_comm_init_hccl', 'c_sync_calc_stream', 'c_sync_comm_stream',
        'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv'
    }

@@ -3043,7 +3043,7 @@ class Block(object):
    def _insert_op_without_sync(self, index, *args, **kwargs):
        """
        Insert an Operator according to the giving arguments,
        without sync_with_cpp to meke the compilation faster.

        Args:

@@ -4455,8 +4455,8 @@ class Program(object):
    def clone(self, for_test=False):
        """
        .. note:::
            1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` .
            2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` .
            3. This API has no effect in Dygraph Mode.

        Create a new Program with forward content of original one when ``for_test=True``.

@@ -4684,8 +4684,8 @@ class Program(object):
    def _prune_with_input(self, feeded_var_names, targets):
        """
        Prune operators and variables which are not needed to generate
        :code:`targets`. Prune operators and variables which are needed
        to generate feeded_var

        Notes: This is a very low level API. Users should not use this API
        directly. This API is in flux and not stable.

@@ -4830,7 +4830,7 @@ class Program(object):
    def parse_from_string(binary_str):
        """
        .. note::
            1. All information about parameters will be lost after serialization;
            2. This API has no effect in Dygraph mode.

        Deserialize a Program from `protobuf <https://en.wikipedia.org/wiki/Protocol_Buffers>`_ binary string.

@@ -4895,7 +4895,7 @@ class Program(object):
        The default random seed for random operators in Program. ``0`` means get
        the random seed from random device.

        .. note::
            It must be set before the operators have been added.

        Returns:

@@ -4933,7 +4933,7 @@ class Program(object):
        """
        The number of :ref:`api_guide_Block_en` in this Program.

        .. note::
            This API has no effect in Dygraph mode.

        Returns:

@@ -5141,8 +5141,8 @@ class Program(object):
        Args:
            other(Program): Other program
            pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program
                self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is
                cloned from block 0 in other, etc. Default is None, which means default mapped,
                {0:0, 1:1,..., n:n}.

        Returns:

@@ -5261,7 +5261,7 @@ class Parameter(Variable):
            be applied on the parameter. Default: None
        do_model_average(bool): True if the model average strategy will
            be applied on this parameter.
        need_clip (bool): Whether the parameter gradient need to be cliped
            in optimizer. Default is True.
    """

@@ -5349,8 +5349,8 @@ class Parameter(Variable):
class ParamBase(core.VarBase):
    """
    ParamBase is derived from Tensor( Which is the concept in Dygraph Mode).
    A ParamBase is a persistable Tensor, and will be updated by optimizers
    after each iteration.
    The training of a neural network is essentially the updating of
    its ParamBase.

@@ -5368,7 +5368,7 @@ class ParamBase(core.VarBase):
            be applied on the ParamBase. Default: None
        do_model_average(bool): True if the model average strategy will
            be applied on this ParamBase.
        need_clip (bool): Whether the parameter gradient need to be cliped
            in optimizer. Default is True.
    """

@@ -5491,7 +5491,7 @@ def default_startup_program():
    Get default/global startup program.

    The :code:`paddle.nn` function will append the initialization operators into startup program.
    The :code:`startup_program` will initialize the parameters by the OPs.

    This method will return the default or the current startup program. Users can use
    :ref:`api_paddle_fluid_framework_program_guard` to switch :ref:`api_paddle_fluid_framework_Program` .

@@ -5499,7 +5499,7 @@ def default_startup_program():
    Returns:
        Program: current default startup program.

    Returns type:

    Examples:
        .. code-block:: python

@@ -5517,13 +5517,13 @@ def default_startup_program():
def default_main_program():
    """
    This API can be used to get ``default main program`` which store the
    descriptions of Ops and tensors.

    For example ``z = paddle.add(x, y)`` will create a new ``add``
    Op and a new ``z`` tensor, and they will be recorded in ``default main program`` .

    The ``default main program`` is the default value for ``Program`` parameter in
    a lot of APIs. For example, the :code:`Executor.run()` will execute the
    :code:`default_main_program` when the program is not specified.

@@ -5593,8 +5593,8 @@ def program_guard(main_program, startup_program=None):
    Args:
        main_program(Program): New main program inside ``with`` statement.
        startup_program(Program, optional): New startup program inside ``with``
            statement. :code:`None` means not changing startup program,
            default_startup_program is still used.
            Default: None.
python/paddle/fluid/transpiler/collective.py
@@ -131,19 +131,32 @@ class Collective(object):
                    self.op_role_key: OpRole.Forward
                })
        elif core.is_compiled_with_npu():
            if rank == 0 and wait_port:
                wait_server_ready(other_endpoints)
            hccl_id_var = block.create_var(
                name=unique_name.generate('hccl_id'),
                persistable=True,
                type=core.VarDesc.VarType.RAW)
            endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
            block.append_op(
                type='c_gen_hccl_id',
                inputs={},
                outputs={'Out': hccl_id_var},
                attrs={
                    'rank': rank,
                    'endpoint': current_endpoint,
                    'other_endpoints': other_endpoints,
                    self.op_role_key: OpRole.Forward
                })
            block.append_op(
                type='c_comm_init_hccl',
                inputs={'X': hccl_id_var},
                outputs={},
                attrs={
                    'nranks': nranks,
                    'rank': rank,
                    'ring_id': ring_id,
                    'device_id': int(os.getenv("FLAGS_selected_npus")),
                    'rank_ids': nranks,
                    self.op_role_key: OpRole.Forward
                })
python/paddle/hapi/model.py
@@ -162,19 +162,33 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
                'ring_id': 0,
            })
    elif core.is_compiled_with_npu():
        if rank == 0 and wait_port:
            wait_server_ready(other_endpoints)
        hccl_id_var = block.create_var(
            name=unique_name.generate('hccl_id'),
            persistable=True,
            type=core.VarDesc.VarType.RAW)
        endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
        block.append_op(
            type='c_gen_hccl_id',
            inputs={},
            outputs={'Out': hccl_id_var},
            attrs={
                'rank': rank,
                'endpoint': current_endpoint,
                'other_endpoints': other_endpoints,
                OP_ROLE_KEY: OpRole.Forward
            })
        block.append_op(
            type='c_comm_init_hccl',
            inputs={'X': hccl_id_var},
            outputs={},
            attrs={
                'nranks': nranks,
                'rank': rank,
                'ring_id': 0,
                'device_id': int(os.getenv("FLAGS_selected_npus")),
                'rank_ids': nranks,
                OP_ROLE_KEY: OpRole.Forward
            })
@@ -852,26 +866,26 @@ class Model(object):
              import paddle.nn as nn
              import paddle.vision.transforms as T
              from paddle.static import InputSpec

              device = paddle.set_device('cpu') # or 'gpu'

              net = nn.Sequential(
                  nn.Flatten(1),
                  nn.Linear(784, 200),
                  nn.Tanh(),
                  nn.Linear(200, 10))

              # inputs and labels are not required for dynamic graph.
              input = InputSpec([None, 784], 'float32', 'x')
              label = InputSpec([None, 1], 'int64', 'label')

              model = paddle.Model(net, input, label)
              optim = paddle.optimizer.SGD(learning_rate=1e-3,
                  parameters=model.parameters())

              model.prepare(optim,
                            paddle.nn.CrossEntropyLoss(),
                            paddle.metric.Accuracy())

              transform = T.Compose([
                  T.Transpose(),
                  T.Normalize([127.5], [127.5])

@@ -914,12 +928,12 @@ class Model(object):
        Run one training step on a batch of data.

        Args:
            inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
                be a numpy array or paddle.Tensor, or a list of arrays or
                tensors (in case the model has multiple inputs).
            labels (numpy.ndarray|Tensor|list): Batch of labels. It could be
                a numpy array or paddle.Tensor, or a list of arrays or tensors
                (in case the model has multiple labels). If has no labels,
                set None. Default is None.

        Returns:

@@ -930,7 +944,7 @@ class Model(object):
        Examples:

            .. code-block:: python

              import numpy as np
              import paddle
              import paddle.nn as nn

@@ -965,12 +979,12 @@ class Model(object):
        Run one evaluating step on a batch of data.

        Args:
            inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
                be a numpy array or paddle.Tensor, or a list of arrays or
                tensors (in case the model has multiple inputs).
            labels (numpy.ndarray|Tensor|list): Batch of labels. It could be
                a numpy array or paddle.Tensor, or a list of arrays or tensors
                (in case the model has multiple labels). If has no labels,
                set None. Default is None.

        Returns:

@@ -981,7 +995,7 @@ class Model(object):
        Examples:

            .. code-block:: python

              import numpy as np
              import paddle
              import paddle.nn as nn

@@ -1017,8 +1031,8 @@ class Model(object):
        Run one predicting step on a batch of data.

        Args:
            inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
                be a numpy array or paddle.Tensor, or a list of arrays or
                tensors (in case the model has multiple inputs).

        Returns:

@@ -1028,14 +1042,14 @@ class Model(object):
        Examples:

            .. code-block:: python

              import numpy as np
              import paddle
              import paddle.nn as nn
              from paddle.static import InputSpec

              device = paddle.set_device('cpu') # or 'gpu'

              input = InputSpec([None, 784], 'float32', 'x')
              label = InputSpec([None, 1], 'int64', 'label')

@@ -1057,12 +1071,12 @@ class Model(object):
            return loss

    def save(self, path, training=True):
        """
        This function saves parameters, optimizer information or model and
        paramters only for inference to path. It depends on the parameter
        `training`.

        If `training` is set to True, the parameters saved contain all
        the trainable Variable, will save to a file with suffix ".pdparams".
        The optimizer information contains all the variable used by optimizer.
        For Adam optimizer, contains beta1, beta2, momentum etc. All the

@@ -1115,13 +1129,13 @@ class Model(object):
                optim = paddle.optimizer.SGD(learning_rate=1e-3,
                    parameters=model.parameters())
                model.prepare(optim, paddle.nn.CrossEntropyLoss())

                transform = T.Compose([
                    T.Transpose(),
                    T.Normalize([127.5], [127.5])
                ])
                data = paddle.vision.datasets.MNIST(mode='train', transform=transform)

                model.fit(data, epochs=1, batch_size=32, verbose=0)
                model.save('checkpoint/test')  # save for training
                model.save('inference_model', False)  # save for inference

@@ -1165,7 +1179,7 @@ class Model(object):
        Examples:

            .. code-block:: python

              import paddle
              import paddle.nn as nn
              from paddle.static import InputSpec

@@ -1247,7 +1261,7 @@ class Model(object):
                from paddle.static import InputSpec

                input = InputSpec([None, 784], 'float32', 'x')

                model = paddle.Model(nn.Sequential(
                    nn.Linear(784, 200),
                    nn.Tanh(),

@@ -1332,12 +1346,12 @@ class Model(object):
        evaluation will be done at the end of each epoch.

        Args:
            train_data (Dataset|DataLoader): An iterable data loader is used for
                train. An instance of paddle paddle.io.Dataset or
                paddle.io.Dataloader is recomended. Default: None.
            eval_data (Dataset|DataLoader): An iterable data loader is used for
                evaluation at the end of epoch. If None, will not do evaluation.
                An instance of paddle.io.Dataset or paddle.io.Dataloader
                is recomended. Default: None.
            batch_size (int): Integer number. The batch size of train_data
                and eval_data. When train_data and eval_data are both the

@@ -1394,10 +1408,10 @@ class Model(object):
                ])
                train_dataset = MNIST(mode='train', transform=transform)
                val_dataset = MNIST(mode='test', transform=transform)

                input = InputSpec([None, 1, 28, 28], 'float32', 'image')
                label = InputSpec([None, 1], 'int64', 'label')

                model = paddle.Model(
                    paddle.vision.models.LeNet(),
                    input, label)

@@ -1426,7 +1440,7 @@ class Model(object):
                dynamic = True
                if not dynamic:
                    paddle.enable_static()

                transform = T.Compose([
                    T.Transpose(),
                    T.Normalize([127.5], [127.5])

@@ -1437,10 +1451,10 @@ class Model(object):
                val_dataset = MNIST(mode='test', transform=transform)
                val_loader = paddle.io.DataLoader(val_dataset,
                    batch_size=64)

                input = InputSpec([None, 1, 28, 28], 'float32', 'image')
                label = InputSpec([None, 1], 'int64', 'label')

                model = paddle.Model(
                    paddle.vision.models.LeNet(), input, label)
                optim = paddle.optimizer.Adam(

@@ -1541,7 +1555,7 @@ class Model(object):
        Args:
            eval_data (Dataset|DataLoader): An iterable data loader is used for
                evaluation. An instance of paddle.io.Dataset or
                paddle.io.Dataloader is recomended.
            batch_size (int): Integer number. The batch size of train_data
                and eval_data. When eval_data is the instance of Dataloader,

@@ -1638,7 +1652,7 @@ class Model(object):
            batch_size (int): Integer number. The batch size of train_data and eval_data.
                When train_data and eval_data are both the instance of Dataloader, this
                argument will be ignored. Default: 1.
            num_workers (int): The number of subprocess to load data, 0 for no subprocess
                used and loading data in main process. When train_data and eval_data are
                both the instance of Dataloader, this argument will be ignored. Default: 0.
            stack_outputs (bool): Whether stack output field like a batch, as for an output

@@ -1857,10 +1871,10 @@ class Model(object):
        """Prints a string summary of the network.

        Args:
            input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor.
                if not set, input_size will get from ``self._inputs`` if network only have
                one input, input_size can be tuple or InputSpec. if model have multiple
                input, input_size must be a list which contain every input's shape.
                Default: None.
            dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None.

@@ -1872,10 +1886,10 @@ class Model(object):
        Examples:
            .. code-block:: python

              import paddle
              from paddle.static import InputSpec

              input = InputSpec([None, 1, 28, 28], 'float32', 'image')
              label = InputSpec([None, 1], 'int64', 'label')

              model = paddle.Model(paddle.vision.LeNet(),
                  input, label)
              optim = paddle.optimizer.Adam(