Commit ce26f882 (unverified), authored by lw921014, committed by GitHub

update Ascendrc hccl to 20.3 (#32126)

Parent commit: 75dd8423
@@ -400,6 +400,7 @@ OperatorBase::OperatorBase(const std::string& type,
 // framework::OpRegistry::CreateOp(type, {}, {}, {}, false).
 // Inputs, outputs and attrs will be set to empty map
 // to improve the execution efficiency of dygraph.
 if (inputs_.size() > 0 || outputs_.size() > 0) {
 GenerateTemporaryNames();
 CheckAllInputOutputSet();
...
@@ -31,6 +31,11 @@
 #endif
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+#include <hccl/hccl.h>
+#include <hccl/hccl_types.h>
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 #include "xpu/bkcl.h"
 #endif
@@ -45,6 +50,10 @@ class Communicator;
 class NCCLCommunicator;
 #endif
 #endif
+#ifdef PADDLE_WITH_ASCEND_CL
+class Communicator;
+class HCCLCommunicator;
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 class BKCLCommunicator;
@@ -157,6 +166,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
 #endif
 operators::CudnnRNNCache,
 #endif
+#if defined(PADDLE_WITH_ASCEND_CL)
+HcclRootInfo,
+#endif
 #if defined(PADDLE_WITH_XPU_BKCL)
 BKCLUniqueId, platform::BKCLCommunicator,
 #endif
...
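Registering HcclRootInfo in the variable type registry is what lets the new HCCL ops pass the root id through a framework::Variable. A minimal sketch of that usage, assuming an Ascend build; the variable name and helper below are illustrative and not part of this commit:

#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl_types.h>
#include "paddle/fluid/framework/scope.h"

// Copy a previously generated root info into a scope variable so that an op
// such as c_comm_init_hccl can read it back through Input("X").
void ShareHcclRootInfo(paddle::framework::Scope* scope,
                       const HcclRootInfo& src) {
  auto* var = scope->Var("HcclId");             // arbitrary variable name
  auto* dst = var->GetMutable<HcclRootInfo>();  // legal because of the registry entry above
  *dst = src;                                   // HcclRootInfo is a plain struct
}
#endif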
@@ -11,7 +11,7 @@ foreach(src ${OPS})
 set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS})
 endforeach()
-register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS})
+register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
 if(WITH_NCCL)
 set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper)
@@ -24,39 +24,43 @@ if(WITH_GLOO)
 endif()
 if(WITH_XPU_BKCL)
-set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper nccl_common)
 op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
 op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 if(WITH_ASCEND_CL)
-set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope)
+set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper)
+op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
+op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS})
 endif()
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
 if(WITH_ASCEND_CL)
-set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags
+set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper
+gen_hccl_id_op op_registry ascend_hccl flags
 dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
 DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
 DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
+DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
+DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
 DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
 DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc
-DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
-cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
-DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
 DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
 DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
 cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc
-DEPS op_registry c_broadcast_op c_comm_init_hcom_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc
 DEPS op_registry elementwise_add_op c_sync_calc_stream_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
 endif()
@@ -31,20 +31,19 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_ASCEND_CL)
 auto in = ctx.Input<framework::Tensor>("X");
 auto out = ctx.Output<framework::Tensor>("Out");
-hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+HcclDataType dtype = platform::ToHCCLDataType(in->type());
 int ring_id = ctx.Attr<int>("ring_id");
 std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
 auto place = ctx.GetPlace();
 auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
 int nranks = comm->nranks();
-std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
 framework::DDim out_dims = in->dims();
 out_dims[0] *= nranks;
 out->mutable_data<T>(out_dims, place);
-int64_t send_numel = in->numel();
+uint64_t send_numel = in->numel();
 void *send_buff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
 void *recv_buff = reinterpret_cast<void*>(out->data<T>());
@@ -59,12 +58,11 @@ class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
 VLOG(3) << "begin hccl allgather, parameter is: "
 << ", group is " << group
 << ", ring_id is " << ring_id
-<< ", nranks is " << nranks
-<< ", tag is " << tag;
+<< ", nranks is " << nranks;
-PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
-tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
-group.c_str(), (void*)stream));
+PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
+send_buff, recv_buff, send_numel, dtype,
+comm->comm(), (void*)stream));
 #else
 PADDLE_THROW(platform::errors::PreconditionNotMet(
...
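Read together, this hunk switches the allgather kernel from a tag/group addressed hcom call to a communicator addressed HCCL call. A condensed, hedged sketch of the difference, assuming the dynload wrappers forward their arguments unchanged (the parameter order is exactly the one visible in the hunk above):

// Before (hcom, addressed by string tag and group name):
//   hcom_all_gather(tag, send_buff, recv_buff, count, dtype, group, stream);
// After (HCCL 20.3, addressed by the HcclComm handle held by HCCLComm):
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather(
    send_buff, recv_buff, send_numel, dtype, comm->comm(),
    reinterpret_cast<void*>(stream)));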
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 USE_OP(c_allgather);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allgather, NPU);
 DECLARE_string(selected_npus);
@@ -56,26 +58,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
 for (auto ele : data) {
 debugstring += std::to_string(ele) + std::string(",");
 }
-VLOG(2) << preStr << ":" << std::endl << debugstring;
+VLOG(2) << preStr << ":" << std::endl <<debugstring;
 }
-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
 int rank_id = atoi(getenv("RANK_ID"));
 int device_id = atoi(getenv("DEVICE_ID"));
-VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-<< "; rank_id = " << rank_id
-<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
 std::vector<int> rank_ids{0, 1};
+f::AttributeMap gen_hccl_id;
+std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
+gen_hccl_id["rank"] = rank_id;
+gen_hccl_id["endpoint"] = endpointList[rank_id];
+std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
+gen_hccl_id["other_endpoints"] = other_endpoints;
+auto out = scope->Var("Out");
+auto id = out->GetMutable<HcclRootInfo>();
+VLOG(3) << "break";
+auto comm_init_op =
+f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+VLOG(3) << "break";
+auto place = ctx.GetPlace();
+comm_init_op->Run(*scope, place);
+ctx.Wait();
+memcpy(hccl_id, id, 1024);
+}
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
+auto x = scope->Var("X");
+auto id = x->GetMutable<HcclRootInfo>();
+memcpy(id, hccl_id, 1024);
+int rank_id = atoi(getenv("RANK_ID"));
+int device_id = atoi(getenv("DEVICE_ID"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+// std::vector<int> rank_ids{0, 1};
 f::AttributeMap comm_init_attrs;
 comm_init_attrs["ring_id"] = 0;
-comm_init_attrs["nranks"] = 2;
+comm_init_attrs["rank_ids"] = 2;
 comm_init_attrs["rank"] = rank_id;
 comm_init_attrs["device_id"] = device_id;
-comm_init_attrs["rank_ids"] = rank_ids;
+// comm_init_attrs["rank_ids"] = rank_ids;
 auto comm_init_op =
-f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
 auto place = ctx.GetPlace();
 comm_init_op->Run(*scope, place);
 ctx.Wait();
@@ -83,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
 // init
-auto x = scope->Var("X");
+auto x = scope->Var("Data");
 auto tensor_x = x->GetMutable<f::LoDTensor>();
 std::vector<float> init;
@@ -102,7 +146,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
 ctx.Wait();
 auto place = ctx.GetPlace();
-auto out = scope->Var("Out");
+auto out = scope->Var("OutData");
 auto tensor_out = out->GetMutable<f::LoDTensor>();
 tensor_out->Resize({num1, num2});
 tensor_out->mutable_data<float>(place); // allocate
@@ -110,12 +154,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
 // run
 f::AttributeMap attrs;
-attrs["tag"] = std::string("tagx");
-attrs["ring_id"] = 0;
-attrs["nranks"] = 2;
+attrs["tag"]=std::string("tagx");
+attrs["ring_id"]=0;
+attrs["nranks"]=2;
-auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
-{{"Out", {"Out"}}}, attrs);
+auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}},
+{{"Out", {"OutData"}}}, attrs);
 for (int i = 0; i < 10; i++) {
 op->Run(*scope, place);
@@ -139,11 +183,12 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_allgather, NPU) {
 f::Scope scope;
+HcclRootInfo hccl_id;
 // only support one device, if more than one device, use first default
-auto* ctx = p::DeviceContextPool::Instance().Get(
-p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-Prepare(&scope, *ctx);
-TestHCCLAllGatherOp(&scope, *ctx);
+PrepareUniqueId(&scope, ctx, &hccl_id);
+Prepare(&scope, ctx, &hccl_id);
+TestHCCLAllGatherOp(&scope, ctx);
 }
@@ -34,6 +34,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -45,7 +46,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 USE_OP(c_allreduce_max);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
 DECLARE_string(selected_npus);
@@ -59,23 +61,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
 VLOG(2) << preStr << ":" << std::endl << debugstring;
 }
-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
 int rank_id = atoi(getenv("RANK_ID"));
 int device_id = atoi(getenv("DEVICE_ID"));
-VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-<< "; rank_id = " << rank_id
-<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
 std::vector<int> rank_ids{0, 1};
+f::AttributeMap gen_hccl_id;
+std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
+gen_hccl_id["rank"] = rank_id;
+gen_hccl_id["endpoint"] = endpointList[rank_id];
+std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
+gen_hccl_id["other_endpoints"] = other_endpoints;
+auto out = scope->Var("Out");
+auto id = out->GetMutable<HcclRootInfo>();
+VLOG(3) << "break";
+auto comm_init_op =
+f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+VLOG(3) << "break";
+auto place = ctx.GetPlace();
+comm_init_op->Run(*scope, place);
+ctx.Wait();
+memcpy(hccl_id, id, 1024);
+}
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
+auto x = scope->Var("X");
+auto id = x->GetMutable<HcclRootInfo>();
+memcpy(id, hccl_id, 1024);
+int rank_id = atoi(getenv("RANK_ID"));
+int device_id = atoi(getenv("DEVICE_ID"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+// std::vector<int> rank_ids{0, 1};
 f::AttributeMap comm_init_attrs;
 comm_init_attrs["ring_id"] = 0;
-comm_init_attrs["nranks"] = 2;
+comm_init_attrs["rank_ids"] = 2;
 comm_init_attrs["rank"] = rank_id;
 comm_init_attrs["device_id"] = device_id;
-comm_init_attrs["rank_ids"] = rank_ids;
+// comm_init_attrs["rank_ids"] = rank_ids;
 auto comm_init_op =
-f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
 auto place = ctx.GetPlace();
 comm_init_op->Run(*scope, place);
 ctx.Wait();
@@ -83,7 +127,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
 // init
-auto x = scope->Var("X");
+auto x = scope->Var("Data");
 auto tensor_x = x->GetMutable<f::LoDTensor>();
 std::vector<float> init;
@@ -102,7 +146,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
 ctx.Wait();
 auto place = ctx.GetPlace();
-auto out = scope->Var("Out");
+auto out = scope->Var("OutData");
 auto tensor_out = out->GetMutable<f::LoDTensor>();
 tensor_out->Resize({num1, num2});
 tensor_out->mutable_data<float>(place); // allocate
@@ -113,8 +157,8 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
 attrs["tag"] = std::string("tagx");
 attrs["ring_id"] = 0;
-auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
-{{"Out", {"Out"}}}, attrs);
+auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}},
+{{"Out", {"OutData"}}}, attrs);
 for (int i = 0; i < 10; i++) {
 op->Run(*scope, place);
@@ -135,11 +179,12 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_allreduce_max, NPU) {
 f::Scope scope;
+HcclRootInfo hccl_id;
 // only support one device, if more than one device, use first default
-auto* ctx = p::DeviceContextPool::Instance().Get(
-p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-Prepare(&scope, *ctx);
-TestHCCLAllReduceOp(&scope, *ctx);
+PrepareUniqueId(&scope, ctx, &hccl_id);
+Prepare(&scope, ctx, &hccl_id);
+TestHCCLAllReduceOp(&scope, ctx);
 }
@@ -117,34 +117,18 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-// we need to pre-allocate 512 Bytes before the data
-// and 512 Bytes after the data, so the hccl allreduce
-// can work. This is a must acooding to huawei peer.
-#define PRE_MALLOC_SIZE_BYTES 512
 auto in = ctx.Input<framework::LoDTensor>("X");
 auto out = ctx.Output<framework::LoDTensor>("Out");
 auto place = ctx.GetPlace();
-hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+HcclDataType dtype = platform::ToHCCLDataType(in->type());
 int64_t numel = in->numel();
-int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T);
-int64_t tmp_numel = numel + pre_tmp_size * 2;
-paddle::framework::LoDTensor tmp_in, tmp_out;
-tmp_in.Resize({tmp_numel});
-tmp_out.Resize({tmp_numel});
-auto p_tmp_in = tmp_in.mutable_data<T>(place); // allocate
-auto p_tmp_out = tmp_out.mutable_data<T>(place); // allocate
-void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
-void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);
+void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+void* recvbuff = reinterpret_cast<void*>(out->data<T>());
 int ring_id = ctx.Attr<int>("ring_id");
 std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
 auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
-std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
 aclrtStream stream = nullptr;
 auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
@@ -154,33 +138,22 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 stream = comm->stream();
 }
-// we need to memset this memory firstly to avoid core by hccl
-platform::NPUMemsetAsync(static_cast<void*>(p_tmp_in), 0, tmp_numel*sizeof(T), stream);
-platform::NPUMemsetAsync(static_cast<void*>(p_tmp_out), 0, tmp_numel*sizeof(T), stream);
-auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
-memory::Copy(npu_place, sendbuff,
-npu_place, reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
-numel * sizeof(T),
-stream);
-hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
+HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
 switch (red_type) {
 case kRedSum:
-hccl_red_type = HCCL_REP_OP_SUM;
+hccl_red_type = HCCL_REDUCE_SUM;
 break;
 case kRedMax:
-hccl_red_type = HCCL_REP_OP_MAX;
+hccl_red_type = HCCL_REDUCE_MAX;
 break;
 case kRedMin:
-hccl_red_type = HCCL_REP_OP_MIN;
+hccl_red_type = HCCL_REDUCE_MIN;
 break;
 case kRedProd:
-hccl_red_type = HCCL_REP_OP_PROD;
+hccl_red_type = HCCL_REDUCE_PROD;
 break;
 default:
@@ -192,16 +165,10 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
 << "input num: " << numel
 << "dtype: " << dtype
 << "hccl_red_type: " << hccl_red_type
-<< ", group is: " << group
-<< ", tag is " << tag;
+<< ", group is: " << group;
-PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
-tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream));
-memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
-npu_place, recvbuff,
-numel * sizeof(T),
-stream);
+PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
+sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream));
 out->Resize(in->dims());
 #else
...
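The rewrite drops the old workaround of staging data through temporary buffers padded with 512 guard bytes and calls HCCL directly on the input/output memory. The reduce-type mapping is the only logic left; a standalone sketch of it, assuming the ReduceType/kRed* enum declared in c_allreduce_op.h and the HCCL 20.3 HcclReduceOp names used above:

#ifdef PADDLE_WITH_ASCEND_CL
#include <hccl/hccl_types.h>
#include "paddle/fluid/operators/collective/c_allreduce_op.h"

// Mirrors the switch in the kernel above; purely illustrative.
inline HcclReduceOp ToHcclReduceOp(paddle::operators::ReduceType red_type) {
  switch (red_type) {
    case paddle::operators::kRedSum:
      return HCCL_REDUCE_SUM;
    case paddle::operators::kRedMax:
      return HCCL_REDUCE_MAX;
    case paddle::operators::kRedMin:
      return HCCL_REDUCE_MIN;
    case paddle::operators::kRedProd:
      return HCCL_REDUCE_PROD;
    default:
      return HCCL_REDUCE_SUM;  // the kernel raises an error here instead
  }
}
#endif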
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -42,7 +43,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 USE_OP(c_allreduce_sum);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
 DECLARE_string(selected_npus);
@@ -56,23 +58,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
 VLOG(3) << preStr << ":" << std::endl << debugstring;
 }
-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
 int rank_id = atoi(getenv("RANK_ID"));
 int device_id = atoi(getenv("DEVICE_ID"));
-VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-<< "; rank_id = " << rank_id
-<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
 std::vector<int> rank_ids{0, 1};
+f::AttributeMap gen_hccl_id;
+std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
+gen_hccl_id["rank"] = rank_id;
+gen_hccl_id["endpoint"] = endpointList[rank_id];
+std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
+gen_hccl_id["other_endpoints"] = other_endpoints;
+auto out = scope->Var("Out");
+auto id = out->GetMutable<HcclRootInfo>();
+VLOG(3) << "break";
+auto comm_init_op =
+f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+VLOG(3) << "break";
+auto place = ctx.GetPlace();
+comm_init_op->Run(*scope, place);
+ctx.Wait();
+memcpy(hccl_id, id, 1024);
+}
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
+auto x = scope->Var("X");
+auto id = x->GetMutable<HcclRootInfo>();
+memcpy(id, hccl_id, 1024);
+int rank_id = atoi(getenv("RANK_ID"));
+int device_id = atoi(getenv("DEVICE_ID"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+// std::vector<int> rank_ids{0, 1};
 f::AttributeMap comm_init_attrs;
 comm_init_attrs["ring_id"] = 0;
-comm_init_attrs["nranks"] = 2;
+comm_init_attrs["rank_ids"] = 2;
 comm_init_attrs["rank"] = rank_id;
 comm_init_attrs["device_id"] = device_id;
-comm_init_attrs["rank_ids"] = rank_ids;
+// comm_init_attrs["rank_ids"] = rank_ids;
 auto comm_init_op =
-f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
 auto place = ctx.GetPlace();
 comm_init_op->Run(*scope, place);
 ctx.Wait();
@@ -81,7 +125,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
 int iter) {
 // init
-auto x = scope->Var("X");
+auto x = scope->Var("Data");
 auto tensor_x = x->GetMutable<f::LoDTensor>();
 int rank_id = atoi(getenv("RANK_ID"));
@@ -100,7 +144,7 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
 tensor_x->Resize({num1, num2});
 ctx.Wait();
-auto out = scope->Var("Out");
+auto out = scope->Var("OutData");
 auto tensor_out = out->GetMutable<f::LoDTensor>();
 tensor_out->Resize({num1, num2});
 tensor_out->mutable_data<float>(place); // allocate
@@ -111,8 +155,10 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
 attrs["tag"] = std::string("tagx_" + std::to_string(iter));
 attrs["ring_id"] = 0;
-auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
-{{"Out", {"Out"}}}, attrs);
+auto op = f::OpRegistry::CreateOp("c_allreduce_sum",
+{{"X", {"Data"}}},
+{{"Out", {"OutData"}}},
+attrs);
 for (int i = 0; i < 10; i++) {
 op->Run(*scope, place);
@@ -133,14 +179,17 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
 TEST(c_allreduce_sum, NPU) {
 f::Scope scope;
+HcclRootInfo hccl_id;
 // only support one device, if more than one device, use first default
-auto* ctx = p::DeviceContextPool::Instance().Get(
-p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+// auto* ctx = p::DeviceContextPool::Instance().Get(
+// p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-Prepare(&scope, *ctx);
-for (int i = 0; i < 1; i++) {
+PrepareUniqueId(&scope, ctx, &hccl_id);
+Prepare(&scope, ctx, &hccl_id);
+for(int i = 0; i < 1; i ++){
 VLOG(2) << "iter num: " << i;
-TestHCCLAllReduceOp(&scope, *ctx, i);
+TestHCCLAllReduceOp(&scope, ctx, i);
 }
 }
@@ -30,7 +30,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
 auto x = ctx.Input<framework::LoDTensor>("X");
 void *ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
 int numel = x->numel();
-hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+HcclDataType dtype = platform::ToHCCLDataType(x->type());
 auto out = ctx.Output<framework::LoDTensor>("Out");
@@ -48,14 +48,12 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
 int root = ctx.Attr<int>("root");
 std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
 VLOG(3) << "begin hccl broadcast, parameter is: "<< "root " << root
-<< ", group is " << group
-<< ", tag is " << tag;
+<< ", group is " << group << ", comm: " << comm->comm() << ", stream: " << stream;
-PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
-dtype, (uint32_t)root, group.c_str(), (void*)stream));
+PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel,
+dtype, (uint32_t)root, comm->comm(), stream));
 VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
 << framework::product(out->dims());
...
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -42,7 +43,8 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 USE_OP(c_broadcast);
-USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
 DECLARE_string(selected_npus);
@@ -53,26 +55,68 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
 for (auto ele : data) {
 debugstring += std::to_string(ele) + std::string(",");
 }
-VLOG(2) << preStr << ":" << std::endl << debugstring;
+VLOG(2) << preStr << ":" << std::endl <<debugstring;
 }
-void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
+void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
 int rank_id = atoi(getenv("RANK_ID"));
 int device_id = atoi(getenv("DEVICE_ID"));
-VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-<< "; rank_id = " << rank_id
-<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
 std::vector<int> rank_ids{0, 1};
+f::AttributeMap gen_hccl_id;
+std::vector<std::string > endpointList={"127.0.0.1:6175", "127.0.0.1:6177"};
+gen_hccl_id["rank"] = rank_id;
+gen_hccl_id["endpoint"] = endpointList[rank_id];
+std::vector<std::string> other_endpoints= {endpointList[rank_id == 0 ? 1 : 0]};
+gen_hccl_id["other_endpoints"] = other_endpoints;
+auto out = scope->Var("Out");
+auto id = out->GetMutable<HcclRootInfo>();
+VLOG(3) << "break";
+auto comm_init_op =
+f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+VLOG(3) << "break";
+auto place = ctx.GetPlace();
+comm_init_op->Run(*scope, place);
+ctx.Wait();
+memcpy(hccl_id, id, 1024);
+}
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
+auto x = scope->Var("X");
+auto id = x->GetMutable<HcclRootInfo>();
+memcpy(id, hccl_id, 1024);
+int rank_id = atoi(getenv("RANK_ID"));
+int device_id = atoi(getenv("DEVICE_ID"));
+VLOG(2) << "rank_id = " << rank_id
+<< "; device_id = " << device_id
+<< "; rank_id = " << rank_id
+<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+// std::vector<int> rank_ids{0, 1};
 f::AttributeMap comm_init_attrs;
 comm_init_attrs["ring_id"] = 0;
-comm_init_attrs["nranks"] = 2;
+comm_init_attrs["rank_ids"] = 2;
 comm_init_attrs["rank"] = rank_id;
 comm_init_attrs["device_id"] = device_id;
-comm_init_attrs["rank_ids"] = rank_ids;
+// comm_init_attrs["rank_ids"] = rank_ids;
 auto comm_init_op =
-f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
 auto place = ctx.GetPlace();
 comm_init_op->Run(*scope, place);
 ctx.Wait();
@@ -80,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
 void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
 // init
-auto x = scope->Var("X");
+auto x = scope->Var("Data");
 auto tensor_x = x->GetMutable<f::LoDTensor>();
 int num = 2;
 std::vector<float> init;
@@ -96,7 +140,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
 ctx.Wait();
 auto place = ctx.GetPlace();
-auto out = scope->Var("Out");
+auto out = scope->Var("OutData");
 auto tensor_out = out->GetMutable<f::LoDTensor>();
 tensor_out->Resize({num, num});
 tensor_out->mutable_data<float>(place); // allocate
@@ -108,8 +152,8 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
 attrs["root"] = 0;
 attrs["ring_id"] = 0;
-auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
-{{"Out", {"Out"}}}, attrs);
+auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}},
+{{"Out", {"OutData"}}}, attrs);
 for (int i = 0; i < 10; i++) {
 op->Run(*scope, place);
@@ -129,11 +173,11 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
 TEST(c_broadcast, NPU) {
 f::Scope scope;
+HcclRootInfo hccl_id;
 // only support one device, if more than one device, use first default
-auto* ctx = p::DeviceContextPool::Instance().Get(
-p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-Prepare(&scope, *ctx);
-TestHCCLBroadcastOp(&scope, *ctx);
+PrepareUniqueId(&scope, ctx, &hccl_id);
+Prepare(&scope, ctx, &hccl_id);
+TestHCCLBroadcastOp(&scope, ctx);
 }
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -12,66 +12,67 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/hccl_helper.h"
 #include <string>
-#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/npu_op_runner.h"
 namespace paddle {
 namespace framework {
 class Scope;
 } // namespace framework
 } // namespace paddle
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#endif
 namespace paddle {
 namespace operators {
-class CCommInitOpNPU : public framework::OperatorBase {
+class CCommInitOpAscend : public framework::OperatorBase {
 public:
-CCommInitOpNPU(const std::string& type,
-const framework::VariableNameMap& inputs,
+CCommInitOpAscend(const std::string& type, const framework::VariableNameMap& inputs,
 const framework::VariableNameMap& outputs,
 const framework::AttributeMap& attrs)
 : OperatorBase(type, inputs, outputs, attrs) {}
 void RunImpl(const framework::Scope& scope,
 const platform::Place& place) const override {
-int rid = Attr<int>("ring_id");
-int nranks = Attr<int>("nranks");
+PADDLE_ENFORCE_EQ(is_npu_place(place), true,
+platform::errors::PreconditionNotMet(
+"CCommInitOpAscend can run on npu place only."));
+auto var = scope.FindVar(Input("X"));
+PADDLE_ENFORCE_NOT_NULL(
+var, platform::errors::InvalidArgument("Input con not be empty."));
+#if defined(PADDLE_WITH_ASCEND_CL)
+HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
+int rank_ids = Attr<int>("rank_ids");
 int rank_id = Attr<int>("rank");
+int rid = Attr<int>("ring_id");
 int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device;
 if (Attr<int>("device_id") >= 0) {
 device_id = Attr<int>("device_id");
 }
-std::vector<int> rank_ids = Attr<std::vector<int>>("rank_ids");
-VLOG(3) << "begin c_comm_init on npu, parameters are: "
-<< "ring id[" << rid
-<< "], nranks[" << nranks
-<< "], rank_id[" << rank_id
-<< "], device_id[" << device_id
-<< "]";
 platform::HCCLCommContext::Instance().CreateHCCLComm(
-rank_ids, rank_id, device_id, rid);
+hccl_id, rank_ids, rank_id, device_id, rid);
+#else
+PADDLE_THROW(platform::errors::PreconditionNotMet(
+"PaddlePaddle should compile with NPU."));
+#endif
 }
 };
-class CCommInitOpNPUMaker : public framework::OpProtoAndCheckerMaker {
+class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker {
 public:
 void Make() override {
+AddInput("X", "Raw variable contains a NCCL UniqueId instaces.");
 AddComment(R"DOC(
-CCommInit operator on NPU
-Initialize collective communication context within this trainer
+CCommInit operator
+Initialize collective communicatoin context within this trainer
 )DOC");
-AddAttr<int>("nranks", "(int) The number of ranks of distributed trainers");
-AddAttr<std::vector<int>>("rank_ids", "The world rank ids of the group");
+AddAttr<int>("rank_ids", "(int) The number of ranks of distributed trainers");
 AddAttr<int>("rank",
 "(int) The rank of the trainer in distributed training.");
 AddAttr<int>("device_id",
@@ -89,6 +90,4 @@ Initialize collective communication context within this trainer
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(c_comm_init_hcom, ops::CCommInitOpNPU, ops::CCommInitOpNPUMaker);
-#endif
+REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, ops::CCommInitOpAscendMaker);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#endif
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {
}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
int rank = Attr<int>("rank");
framework::Scope& local_scope = scope.NewScope();
std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
return Output("Out");
};
if (rank == 0) {
std::vector<std::string> endpoint_list =
Attr<std::vector<std::string>>("other_endpoints");
SendBroadCastHCCLID(endpoint_list, 1, func, local_scope);
} else {
std::string endpoint = Attr<std::string>("endpoint");
RecvBroadCastHCCLID(endpoint, 1, func, local_scope);
}
scope.DeleteScope(&local_scope);
}
};
#else
class CGenHCCLIdOp : public framework::OperatorBase {
public:
CGenHCCLIdOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
}
};
#endif
class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
VLOG(3) << "ele";
AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces.");
AddComment(R"DOC(
CGenHCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server.
)DOC");
AddAttr<std::string>("endpoint",
"(string), e.g. 127.0.0.1:6175 "
"current listen endpoint");
AddAttr<std::vector<std::string>>(
"other_endpoints",
"['trainer1_ip:port', 'trainer2_ip:port', ...] "
"list of other trainer endpoints")
.SetDefault({});
AddAttr<int>("rank",
"(int default 0) "
"The rank of the trainer in distributed training.")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker);
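With c_gen_hccl_id registered here and c_comm_init_hccl registered above, initialization becomes a two-step sequence, which the updated NPU unit tests follow. A condensed sketch of that sequence; the variable name "HcclId", the endpoints, the two-rank world size, and the pre-existing scope/place/rank_id/device_id are assumptions for illustration, and f/p are the framework/platform aliases used in the tests:

// Step 1: rank 0 generates the HCCL root info and sends it to the other
// endpoints; every rank stores it in a scope variable.
f::AttributeMap gen_attrs;
gen_attrs["rank"] = rank_id;
gen_attrs["endpoint"] = std::string("127.0.0.1:6175");                      // this rank's listen endpoint
gen_attrs["other_endpoints"] = std::vector<std::string>{"127.0.0.1:6177"};  // peer endpoints
scope.Var("HcclId")->GetMutable<HcclRootInfo>();
f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"HcclId"}}}, gen_attrs)
    ->Run(scope, place);

// Step 2: feed the root info to c_comm_init_hccl, which creates the HCCLComm
// for ring_id 0 ("rank_ids" is the world size, replacing the old "nranks").
f::AttributeMap init_attrs;
init_attrs["ring_id"] = 0;
init_attrs["rank_ids"] = 2;
init_attrs["rank"] = rank_id;
init_attrs["device_id"] = device_id;
f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"HcclId"}}}, {}, init_attrs)
    ->Run(scope, place);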
@@ -63,6 +63,7 @@ class CGenNCCLIdOp : public framework::OperatorBase {
 };
 #else
 class CGenNCCLIdOp : public framework::OperatorBase {
 public:
 CGenNCCLIdOp(const std::string& type,
...
@@ -121,31 +121,15 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
 public:
 void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
-// we need to pre-allocate 512 Bytes before the data
-// and 512 Bytes after the data, so the hccl allreduce
-// can work. This is a must acooding to huawei peer.
-#define PRE_MALLOC_SIZE_BYTES 512
 auto in = ctx.Input<framework::LoDTensor>("X");
 auto out = ctx.Output<framework::LoDTensor>("Out");
 auto place = ctx.GetPlace();
-hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+HcclDataType dtype = platform::ToHCCLDataType(in->type());
 int64_t numel = in->numel();
-int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T);
-int64_t tmp_numel = numel + pre_tmp_size * 2;
-paddle::framework::LoDTensor tmp_in, tmp_out;
-tmp_in.Resize({tmp_numel});
-tmp_out.Resize({tmp_numel});
-auto p_tmp_in = tmp_in.mutable_data<T>(place); // allocate
-auto p_tmp_out = tmp_out.mutable_data<T>(place); // allocate
-void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
-void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);
-std::string tag = ctx.Attr<std::string>("tag");
+void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+void* recvbuff = reinterpret_cast<void*>(out->data<T>());
 int ring_id = ctx.Attr<int>("ring_id");
 int root_id = ctx.Attr<int>("root_id");
 std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
@@ -161,33 +145,22 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
 int rank_id = comm->rank();
-// we need to memset this memory firstly to avoid core by hccl
-platform::NPUMemsetAsync(static_cast<void*>(p_tmp_in), 0, tmp_numel*sizeof(T), stream);
-platform::NPUMemsetAsync(static_cast<void*>(p_tmp_out), 0, tmp_numel*sizeof(T), stream);
-auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
-memory::Copy(npu_place, sendbuff,
-npu_place, reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
-numel * sizeof(T),
-stream);
-hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
+HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
 switch (red_type) {
 case kRedSum:
-hccl_red_type = HCCL_REP_OP_SUM;
+hccl_red_type = HCCL_REDUCE_SUM;
 break;
 case kRedMax:
-hccl_red_type = HCCL_REP_OP_MAX;
+hccl_red_type = HCCL_REDUCE_MAX;
 break;
 case kRedMin:
-hccl_red_type = HCCL_REP_OP_MIN;
+hccl_red_type = HCCL_REDUCE_MIN;
 break;
 case kRedProd:
-hccl_red_type = HCCL_REP_OP_PROD;
+hccl_red_type = HCCL_REDUCE_PROD;
 break;
 default:
@@ -200,18 +173,14 @@ class CReduceOpASCENDKernel : public framework::OpKernel<T> {
 << "root_id: " << root_id
 << "dtype: " << dtype
 << "hccl_red_type: " << hccl_red_type
-<< ", group is: " << group
-<< ", tag is " << tag;
+<< ", group is: " << group;
-PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
-tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream));
+PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
+sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), (void*)stream));
-if(rank_id == root_id){
-memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
-npu_place, recvbuff,
-numel * sizeof(T),
-stream);
-}else{
+if(rank_id != root_id){
+auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
 memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
 npu_place, reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
 numel * sizeof(T),
...
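Because this path drives HCCL only through HcclAllReduce, the NPU c_reduce kernel now emulates a rooted reduce: every rank all-reduces in place, then non-root ranks copy the original input back into Out so that only root_id keeps the reduced result. A condensed sketch of that tail, reusing the variables defined in the kernel above (illustrative, not a new API):

// Sketch only: all-reduce into Out, then undo it on non-root ranks.
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce(
    sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(),
    reinterpret_cast<void*>(stream)));
if (rank_id != root_id) {
  // Out aliases recvbuff; restore the local input so the reduced value is
  // only observable on the root rank.
  auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
  memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
               npu_place, reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
               numel * sizeof(T), stream);
}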
...@@ -31,6 +31,7 @@ limitations under the License. */ ...@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_reduce_op.h" #include "paddle/fluid/operators/collective/c_reduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
...@@ -42,7 +43,8 @@ namespace p = paddle::platform; ...@@ -42,7 +43,8 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
USE_OP(c_reduce_sum); USE_OP(c_reduce_sum);
USE_NO_KERNEL_OP(c_comm_init_hcom); USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
DECLARE_string(selected_npus); DECLARE_string(selected_npus);
...@@ -56,23 +58,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) { ...@@ -56,23 +58,65 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
VLOG(3) << preStr << ":" << std::endl << debugstring; VLOG(3) << preStr << ":" << std::endl << debugstring;
} }
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID")); int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id VLOG(2) << "rank_id = " << rank_id
<< "; rank_id = " << rank_id << "; device_id = " << device_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE")); << "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1}; std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op =
f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs; f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0; comm_init_attrs["ring_id"] = 0;
comm_init_attrs["nranks"] = 2; comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id; comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id; comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids; // comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place); comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
...@@ -80,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -80,7 +124,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
// init // init
auto x = scope->Var("X"); auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>(); auto tensor_x = x->GetMutable<f::LoDTensor>();
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
...@@ -99,7 +143,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { ...@@ -99,7 +143,7 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
tensor_x->Resize({num1, num2}); tensor_x->Resize({num1, num2});
ctx.Wait(); ctx.Wait();
auto out = scope->Var("Out"); auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2}); tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate tensor_out->mutable_data<float>(place); // allocate
...@@ -112,8 +156,10 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { ...@@ -112,8 +156,10 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
int root_id = 0; int root_id = 0;
attrs["root_id"] = root_id; attrs["root_id"] = root_id;
auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"X"}}}, auto op = f::OpRegistry::CreateOp("c_reduce_sum",
{{"Out", {"Out"}}}, attrs); {{"X", {"Data"}}},
{{"Out", {"OutData"}}},
attrs);
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
...@@ -136,14 +182,15 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { ...@@ -136,14 +182,15 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
TEST(c_reduce_sum, NPU) { TEST(c_reduce_sum, NPU) {
f::Scope scope; f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default // only support one device, if more than one device, use first default
auto* ctx = p::DeviceContextPool::Instance().Get( p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, *ctx); PrepareUniqueId(&scope, ctx, &hccl_id);
for (int i = 0; i < 2; i++) { Prepare(&scope, ctx, &hccl_id);
for(int i = 0; i < 2; i ++){
VLOG(2) << "iter num: " << i; VLOG(2) << "iter num: " << i;
TestHCCLReduceOp(&scope, *ctx, i); TestHCCLReduceOp(&scope, ctx, i);
} }
} }
...@@ -35,7 +35,6 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> { ...@@ -35,7 +35,6 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
int nranks = comm->nranks(); int nranks = comm->nranks();
std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
auto out_dims = in->dims(); auto out_dims = in->dims();
PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
...@@ -47,11 +46,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> { ...@@ -47,11 +46,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
out_dims[0] = out_dims[0] / nranks; out_dims[0] = out_dims[0] / nranks;
out->mutable_data<T>(out_dims, place); out->mutable_data<T>(out_dims, place);
int64_t recv_numel = in->numel() / nranks; uint64_t recv_numel = in->numel() / nranks;
void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>())); void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
void* outputPtr = reinterpret_cast<void*>(out->data<T>()); void* outputPtr = reinterpret_cast<void*>(out->data<T>());
hcclDataType_t dtype = platform::ToHCCLDataType(in->type()); HcclDataType dtype = platform::ToHCCLDataType(in->type());
aclrtStream stream = nullptr; aclrtStream stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
...@@ -63,12 +62,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> { ...@@ -63,12 +62,11 @@ class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
VLOG(3) << "begin hccl reduce scatter, parameter is: " VLOG(3) << "begin hccl reduce scatter, parameter is: "
<< "recv_numel: " << recv_numel << "recv_numel: " << recv_numel
<< "dtype: " << dtype << "dtype: " << dtype
<< "hccl_red_type: " << HCCL_REP_OP_SUM << "hccl_red_type: " << HCCL_REDUCE_SUM
<< ", group is: " << group << ", group is: " << group;
<< ", tag is " << tag;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter( PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter(
tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype, HCCL_REP_OP_SUM, group.c_str(), (void*)stream)); inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), (void*)stream));
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
......
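A note for readers of the hunk above: the reduce-scatter output is simply the input split evenly along dim 0, one slice per rank, which is also where the `out_dims[0] % nranks == 0` check comes from. A minimal standalone sketch of that shape arithmetic (plain C++, made-up shapes, not part of the patch):

    // Standalone illustration of the reduce-scatter shape math used above.
    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main() {
      const int nranks = 2;                     // matches the 2-NPU tests below
      std::vector<int64_t> in_dims = {4, 6};    // example input shape
      int64_t numel = in_dims[0] * in_dims[1];  // 24 elements in total

      assert(in_dims[0] % nranks == 0);         // same condition as the PADDLE_ENFORCE_EQ
      std::vector<int64_t> out_dims = in_dims;
      out_dims[0] /= nranks;                    // each rank keeps a {2, 6} slice
      uint64_t recv_numel = static_cast<uint64_t>(numel) / nranks;  // 12 per rank

      assert(out_dims[0] == 2 && recv_numel == 12);
      return 0;
    }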
...@@ -34,6 +34,7 @@ limitations under the License. */ ...@@ -34,6 +34,7 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
...@@ -45,7 +46,8 @@ namespace p = paddle::platform; ...@@ -45,7 +46,8 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
USE_OP(c_reducescatter); USE_OP(c_reducescatter);
USE_NO_KERNEL_OP(c_comm_init_hcom); USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
DECLARE_string(selected_npus); DECLARE_string(selected_npus);
...@@ -59,7 +61,8 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) { ...@@ -59,7 +61,8 @@ void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
VLOG(2) << preStr << ":" << std::endl << debugstring; VLOG(2) << preStr << ":" << std::endl << debugstring;
} }
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID")); int device_id = atoi(getenv("DEVICE_ID"));
...@@ -68,22 +71,63 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -68,22 +71,63 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
std::vector<int> rank_ids{0, 1}; std::vector<int> rank_ids{0, 1};
f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op =
f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
memcpy(hccl_id, id, 1024);
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs; f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0; comm_init_attrs["ring_id"] = 0;
comm_init_attrs["nranks"] = 2; comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id; comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id; comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids; // comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place); comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
} }
void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
// init // init
auto x = scope->Var("X"); auto x = scope->Var("Data");
auto tensor_x = x->GetMutable<f::LoDTensor>(); auto tensor_x = x->GetMutable<f::LoDTensor>();
std::vector<float> init; std::vector<float> init;
...@@ -101,7 +145,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -101,7 +145,7 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait(); ctx.Wait();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto out = scope->Var("Out"); auto out = scope->Var("OutData");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
tensor_out->Resize({num1, num2}); tensor_out->Resize({num1, num2});
tensor_out->mutable_data<float>(place); // allocate tensor_out->mutable_data<float>(place); // allocate
...@@ -114,14 +158,14 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -114,14 +158,14 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
attrs["ring_id"] = 0; attrs["ring_id"] = 0;
attrs["nranks"] = 2; attrs["nranks"] = 2;
auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}}, auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}},
{{"Out", {"Out"}}}, attrs); {{"Out", {"OutData"}}}, attrs);
int iter_num = 10; int iter_num = 10;
for (int i = 0; i < iter_num; i++) { for (int i = 0; i < iter_num; i++) {
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait();
} }
ctx.Wait();
std::vector<float> out_vec; std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
...@@ -130,17 +174,18 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { ...@@ -130,17 +174,18 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
PrintDebugInfo("output data", out_vec); PrintDebugInfo("output data", out_vec);
EXPECT_EQ(out_vec.size(), init.size() / 2); EXPECT_EQ(out_vec.size(), init.size() / 2);
for (uint32_t i = 0; i < out_vec.size(); i++) { for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], iter_num + 1); EXPECT_EQ(out_vec[i], 2.0);
} }
} }
TEST(c_reducescatter, NPU) { TEST(c_reducescatter, NPU) {
f::Scope scope; f::Scope scope;
HcclRootInfo hccl_id;
// only support one device, if more than one device, use first default // only support one device, if more than one device, use first default
auto* ctx = p::DeviceContextPool::Instance().Get( p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, *ctx); PrepareUniqueId(&scope, ctx, &hccl_id);
TestHCCLReduceScatterOp(&scope, *ctx); Prepare(&scope, ctx, &hccl_id);
TestHCCLReduceScatterOp(&scope, ctx);
} }
...@@ -41,7 +41,7 @@ namespace m = paddle::operators::math; ...@@ -41,7 +41,7 @@ namespace m = paddle::operators::math;
USE_OP(c_broadcast); USE_OP(c_broadcast);
USE_NO_KERNEL_OP(c_sync_comm_stream); USE_NO_KERNEL_OP(c_sync_comm_stream);
USE_NO_KERNEL_OP(c_comm_init_hcom); USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU); USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hccl_helper.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/split.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
namespace paddle {
namespace operators {
#ifdef PADDLE_WITH_ASCEND_CL
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
std::vector<std::string> trainers =
Attr<std::vector<std::string>>("trainers");
int trainer_id = Attr<int>("trainer_id");
std::string endpoint = trainers[trainer_id];
PADDLE_ENFORCE_GE(trainer_id, 0,
                  platform::errors::InvalidArgument(
                      "trainer_id %d is less than 0. Its "
                      "valid range is [0, trainer_size)",
                      trainer_id));
PADDLE_ENFORCE_LT(
trainer_id, static_cast<int>(trainers.size()),
platform::errors::OutOfRange("trainer_id %d is out of range. Its valid "
"range is [0, trainer_size)",
trainer_id));
int hccl_comm_num = Attr<int>("hccl_comm_num");
int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
int inter_trainer_id = -1;
int exter_trainer_id = -1;
if (use_hierarchical_allreduce) {
PADDLE_ENFORCE_GT(
trainers.size(), 1,
platform::errors::PreconditionNotMet(
"The number of collective trainers %llu <= 1", trainers.size()));
PADDLE_ENFORCE_GT(
inter_nranks, 1,
platform::errors::PreconditionNotMet(
"inter_nranks %d <= 1 while in hierarchical allreduce mode",
inter_nranks));
PADDLE_ENFORCE_EQ(
trainers.size() % inter_nranks, 0,
platform::errors::PreconditionNotMet(
"The number of trainers %llu mod inter_nranks %d is not equal 0",
trainers.size(), inter_nranks));
inter_trainer_id = trainer_id % inter_nranks;
if (trainer_id % inter_nranks == 0) {
exter_trainer_id = trainer_id / inter_nranks;
}
}
std::ostringstream ss;
for (size_t i = 0; i < trainers.size(); i++) {
ss << trainers[i] << ",";
}
VLOG(1) << "trainer_id:" << trainer_id
<< ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
<< ", hccl_comm_num:" << hccl_comm_num
<< ", inter_nranks:" << inter_nranks
<< ", inter_trainer_id:" << inter_trainer_id
<< ", exter_trainer_id:" << exter_trainer_id
<< ", trainers:" << ss.str();
int server_fd = -1;
/// 1. init flat
std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
if (trainer_id == 0) {
// server endpoints
std::vector<std::string> flat_endpoints;
flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1,
trainers.end());
SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
} else {
server_fd = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 2. hierarchical inter hccl id
func = platform::GetHierarchicalInterHCCLVarName;
if (inter_trainer_id == 0) {
std::ostringstream ss;
ss << endpoint;
std::vector<std::string> inter_endpoints;
for (int i = trainer_id + 1; i < trainer_id + inter_nranks &&
i < static_cast<int>(trainers.size());
i++) {
ss << ",";
inter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
} else if (inter_trainer_id > 0) {
VLOG(1) << "Hierarchical inter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
/// 3. hierarchical exter hccl id
func = platform::GetHierarchicalExterHCCLVarName;
if (exter_trainer_id == 0) {
std::ostringstream ss;
std::vector<std::string> exter_endpoints;
ss << endpoint;
for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
ss << ",";
exter_endpoints.push_back(trainers[i]);
ss << trainers[i];
}
VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
} else if (exter_trainer_id > 0) {
VLOG(1) << "Hierarchical exter ring";
RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
}
// close socket server
if (trainer_id != 0) {
CloseSocket(server_fd);
}
}
};
#else
class GenHCCLIdOp : public framework::OperatorBase {
public:
GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
}
};
#endif
class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces.");
AddComment(R"DOC(
GenHCCLId operator
For trainer 0: generate a new UniqueId and send it to all the other trainers.
For trainer 1~n: start a socket server to get the UniqueId; once it is received, stop the server.
)DOC");
AddAttr<std::vector<std::string>>(
"trainers",
"['trainer0_ip:port', 'trainer1_ip:port', ...] "
"list of all trainer endpoints")
.SetDefault({});
AddAttr<int>("trainer_id",
"(int) "
"The index of the trainer in distributed training.");
AddAttr<int>("hccl_comm_num",
"(int default 1) "
"The number of nccl communicator num.")
.SetDefault(1);
AddAttr<bool>("use_hierarchical_allreduce",
"(bool default false) "
"Wheter to use hierarchical allreduce.")
.SetDefault(false);
AddAttr<int>("hierarchical_allreduce_inter_nranks",
"(int default 1) "
"Wheter to use hierarchical allreduce.")
.SetDefault(-1);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker);
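The inter/exter trainer-id arithmetic in RunImpl above is easiest to follow with concrete numbers. A standalone sketch (hypothetical trainer counts, not part of the patch) that mirrors the same modulo/division logic:

    // Illustration of the hierarchical-allreduce partition used by GenHCCLIdOp::RunImpl.
    #include <cstdio>

    int main() {
      const int num_trainers = 8;
      const int inter_nranks = 4;  // trainers per inner ring; must divide num_trainers

      for (int trainer_id = 0; trainer_id < num_trainers; ++trainer_id) {
        int inter_trainer_id = trainer_id % inter_nranks;  // position in its inner ring
        int exter_trainer_id =
            (trainer_id % inter_nranks == 0) ? trainer_id / inter_nranks : -1;
        // Trainers 0 and 4 lead the two inner rings and together form the outer ring.
        std::printf("trainer %d -> inter %d, exter %d\n", trainer_id,
                    inter_trainer_id, exter_trainer_id);
      }
      return 0;
    }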
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include <arpa/inet.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <algorithm>
#include <ostream>
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/hccl_helper.h"
#endif
namespace paddle {
namespace operators {
constexpr char COMM_HEAD[] = "_pd_gen_comm_id_";
#define HCCL_UNIQUE_ID_BYTES 1024
// Check system calls, such as socket, bind.
#define CHECK_SYS_CALL(call, name) \
do { \
int retval; \
CHECK_SYS_CALL_VAL(call, name, retval); \
} while (false)
#define CHECK_SYS_CALL_VAL(call, name, retval) \
do { \
RETRY_SYS_CALL_VAL(call, name, retval); \
if (retval == -1) { \
PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \
name, strerror(errno))); \
} \
} while (false)
#define RETRY_SYS_CALL_VAL(call, name, retval) \
do { \
retval = (call); \
if (retval == -1 && \
(errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \
LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \
<< " retry"; \
} else { \
break; \
} \
} while (true)
static int SocketSend(int fd, const char* buffer, int size) {
int offset = 0;
int bytes = 0;
while (offset < size) {
bytes = send(fd, buffer + offset, size - offset, 0);
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
// send failed
return -1;
} else {
bytes = 0;
}
}
offset += bytes;
}
return offset;
}
static int SocketRecv(int fd, char* buffer, int size) {
int offset = 0;
int bytes = 0;
while (offset < size) {
bytes = recv(fd, buffer + offset, size - offset, 0);
if (bytes == 0) {
// closed by the client, possibly just a liveness probe
return 0;
}
if (bytes == -1) {
if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) {
return -1;
} else {
bytes = 0;
}
}
offset += bytes;
}
return offset;
}
static void BindOrConnectFailed(int timeout, int* try_times, int* total_time,
const char* op, const std::string& ep) {
PADDLE_ENFORCE_LT(
*total_time, timeout,
platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op,
ep.c_str(), strerror(errno)));
++(*try_times);
int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds
*total_time += retry_time;
LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times
<< " times with reason: " << strerror(errno) << " retry after "
<< retry_time / 1000.0 << " seconds";
std::this_thread::sleep_for(std::chrono::milliseconds(retry_time));
}
int CreateListenSocket(const std::string& ep) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(
addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0];
int port = std::stoi(addr[1]);
// creating socket fd
int server_fd = -1;
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd);
// NOTE. Solutions to `Address already in use`.
// 1. Reuse addr&port. Otherwise, once the server closes the socket
// before client, the server will enter TIME-WAIT status. If we bind port
// again, the error `Address already in use` will appear.
// 2. Or we can close the client first to ensure that the server does
// not enter the TIME-WAIT state. But this is obviously not as convenient
// as the reuse method.
int opt = 1;
#if defined(SO_REUSEPORT)
// since Linux kernel 3.9
CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT,
&opt, sizeof(opt)),
"setsockopt");
#else
CHECK_SYS_CALL(
setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)),
"setsockopt");
#endif
struct sockaddr_in address;
address.sin_family = AF_INET;
address.sin_addr.s_addr = INADDR_ANY;
address.sin_port = htons(port);
// TODO(wangxi) Set from env, default 900s=15min
int timeout = 900 * 1000;
int try_times = 0;
int total_time = 0;
while (true) {
int ret_val = -1;
RETRY_SYS_CALL_VAL(
bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind",
ret_val);
if (ret_val == -1) {
BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep);
continue;
}
break;
}
CHECK_SYS_CALL(listen(server_fd, 3), "listen");
LOG(INFO) << "Server listening on: " << ep << " successful.";
return server_fd;
}
void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); }
static int SocketAccept(int server_fd, const char* head) {
struct sockaddr_in client_addr;
socklen_t addr_length = sizeof(client_addr);
char buffer[1024] = {0};
int conn = -1;
while (true) {
CHECK_SYS_CALL_VAL(
accept(server_fd, reinterpret_cast<struct sockaddr*>(&client_addr),
&addr_length),
"accept", conn);
int ret_val = SocketRecv(conn, buffer, strlen(head));
if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) {
break; // accept client
} else {
VLOG(3) << "socket read failed with ret_val=" << ret_val;
CloseSocket(conn);
}
}
return conn;
}
static int ConnectAddr(const std::string& ep, const char* head) {
auto addr = paddle::string::Split(ep, ':');
PADDLE_ENFORCE_EQ(
addr.size(), 2UL,
platform::errors::InvalidArgument(
"The endpoint should contain host and port, but got %s.", ep));
std::string host = addr[0];
int port = std::stoi(addr[1]);
int sock = -1;
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock);
struct sockaddr_in server_addr;
memset(&server_addr, 0, sizeof(server_addr));
server_addr.sin_family = AF_INET;
server_addr.sin_port = htons(port);
char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));
int i = 0;
while (hp->h_addr_list[i] != NULL) {
ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]);
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
break;
}
PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0,
platform::errors::Unavailable("Open address %s failed: %s",
ep, strerror(errno)));
// TODO(wangxi) Set from env, default 900s=15min
int timeout = 900 * 1000;
int try_times = 0;
int total_time = 0;
while (true) {
int ret_val = -1;
RETRY_SYS_CALL_VAL(
connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)),
"connect", ret_val);
if (ret_val == -1) {
BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep);
continue;
}
CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send");
break;
}
return sock;
}
static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) {
char buffer[1024] = {0};
static_assert(HCCL_UNIQUE_ID_BYTES <= 1024,
"hccl id bytes must <= buffer size");
CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), "recv hccl id");
memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES);
}
static void SendHCCLID(int conn, HcclRootInfo* hccl_id) {
char buffer[1024] = {0};
memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES);
CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES),
"send hccl id");
}
void SendBroadCastHCCLID(std::vector<std::string> servers, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
// connect with server
std::vector<int> connects;
for (auto server : servers) {
VLOG(3) << "connecting endpoint: " << server;
int conn = ConnectAddr(server, COMM_HEAD);
connects.push_back(conn);
}
VLOG(3) << "connecting completed...";
for (int i = 0; i < hccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id));
int j = 0;
for (auto conn : connects) {
VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j]
<< " hccl_comm_no: " << i;
SendHCCLID(conn, hccl_id);
++j;
}
VLOG(3) << "sending completed...";
}
// close client
for (auto conn : connects) {
CloseSocket(conn);
}
}
void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
int server = CreateListenSocket(endpoint);
RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope);
CloseSocket(server);
}
void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope) {
int client = SocketAccept(server_fd, COMM_HEAD);
for (int i = 0; i < hccl_comm_num; ++i) {
std::string var_name = func(i);
auto var = scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(
var, platform::errors::NotFound("Variable with name %s is not found",
var_name.c_str()));
auto hccl_id = var->GetMutable<HcclRootInfo>();
VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name
<< " from trainer 0, hccl_comm_no: " << i;
RecvHCCLID(client, hccl_id);
}
VLOG(3) << "receiving completed...";
CloseSocket(client);
}
} // namespace operators
} // namespace paddle
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <functional>
#include <string>
#include <vector>
namespace paddle {
namespace framework {
class Scope;
} // namespace framework
} // namespace paddle
namespace paddle {
namespace operators {
int CreateListenSocket(const std::string& ep);
void CloseSocket(int fd);
void SendBroadCastHCCLID(std::vector<std::string> servers, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
// server listen on endpoint, then recv nccl id
void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
// recv nccl id from socket
void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num,
std::function<std::string(size_t)> func,
const framework::Scope& scope);
} // namespace operators
} // namespace paddle
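Taken together, these helpers implement a star-shaped handshake: trainer 0 generates each HcclRootInfo and pushes it to every peer, while the other trainers block on their listen sockets until it arrives. A hedged sketch of how a caller pairs them, mirroring GenHCCLIdOp::RunImpl (the wrapper function and its names are illustrative only):

    #include <functional>
    #include <string>
    #include <vector>

    #include "paddle/fluid/framework/scope.h"
    #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"

    // Sketch only (not part of the patch): pairing of the broadcast helpers.
    // `scope` must already hold HcclRootInfo variables named by `var_name_of(i)`.
    void BroadcastRootInfo(int trainer_id,
                           const std::vector<std::string>& trainers,
                           int hccl_comm_num,
                           std::function<std::string(size_t)> var_name_of,
                           const paddle::framework::Scope& scope) {
      if (trainer_id == 0) {
        // Trainer 0 fills the scope variables via HcclGetRootInfo inside
        // SendBroadCastHCCLID and pushes each id to every other endpoint.
        std::vector<std::string> peers(trainers.begin() + 1, trainers.end());
        paddle::operators::SendBroadCastHCCLID(peers, hccl_comm_num, var_name_of,
                                               scope);
      } else {
        // Every other trainer listens on its own endpoint until the ids arrive.
        paddle::operators::RecvBroadCastHCCLID(trainers[trainer_id], hccl_comm_num,
                                               var_name_of, scope);
      }
    }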
...@@ -27,32 +27,39 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> { ...@@ -27,32 +27,39 @@ class CRecvOpASCENDKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
auto out = ctx.Output<framework::LoDTensor>("Out"); auto x = ctx.Output<framework::LoDTensor>("Out");
int numel = out->numel(); void *ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
hcclDataType_t dtype = platform::ToHCCLDataType(out->type()); int numel = x->numel();
HcclDataType dtype = platform::ToHCCLDataType(x->type());
int ring_id = ctx.Attr<int>("ring_id"); int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr; aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream(); stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); int nranks = comm->nranks();
int srcRank = ctx.Attr<int>("peer"); int peer = ctx.Attr<int>("peer");
int srTag = ctx.Attr<int>("srTag");
VLOG(3) << "recv_v2_npu attr get"; PADDLE_ENFORCE_EQ(nranks, 2,
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive( platform::errors::InvalidArgument(
tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), (u64)numel, dtype, srcRank, "The nranks must be 2, but (%d)",
srTag, group.c_str(), stream)); nranks));
VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. receiving ";
out->Resize(out->dims()); int root = peer;
out->set_lod(out->lod());
VLOG(3) << "begin hccl recv, parameter is: "<< "root " << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel,
dtype, (uint32_t)root, comm->comm(), stream));
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
"PaddlePaddle should compile with NPU.")); "PaddlePaddle should compile with NPU."));
......
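What the hunk above amounts to: recv_v2 is emulated as a two-rank HcclBroadcast whose root is the sending peer, and send_v2 (further below) is the mirror image with root = rank. A hedged sketch of that pairing with placeholder buffers (not part of the patch; headers as used by the kernels above):

    #include "paddle/fluid/platform/collective_helper.h"
    #include "paddle/fluid/platform/hccl_helper.h"

    // Sketch only: send/recv emulated as a 2-rank broadcast (root == sender).
    // `comm` is assumed to have exactly two ranks; buffers and counts are placeholders.
    void EmulatedSend(void* buf, uint64_t count, HcclDataType dtype,
                      paddle::platform::HCCLComm* comm, aclrtStream stream) {
      // The sender is the broadcast root, so its buffer is the source.
      PADDLE_ENFORCE_NPU_SUCCESS(paddle::platform::dynload::HcclBroadcast(
          buf, count, dtype, static_cast<uint32_t>(comm->rank()), comm->comm(),
          stream));
    }

    void EmulatedRecv(void* buf, uint64_t count, HcclDataType dtype, int peer,
                      paddle::platform::HCCLComm* comm, aclrtStream stream) {
      // The receiver names the sending peer as root and gets its data in `buf`.
      PADDLE_ENFORCE_NPU_SUCCESS(paddle::platform::dynload::HcclBroadcast(
          buf, count, dtype, static_cast<uint32_t>(peer), comm->comm(), stream));
    }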
...@@ -31,6 +31,8 @@ limitations under the License. */ ...@@ -31,6 +31,8 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/recv_v2_op.h" #include "paddle/fluid/operators/collective/recv_v2_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
...@@ -42,83 +44,126 @@ namespace p = paddle::platform; ...@@ -42,83 +44,126 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
USE_OP(recv_v2); USE_OP(recv_v2);
USE_NO_KERNEL_OP(c_comm_init_hcom); USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(recv_v2, NPU); USE_OP_DEVICE_KERNEL(recv_v2, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
std::string rank_table_file = getenv("RANK_TABLE_FILE");
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID")); int device_id = atoi(getenv("DEVICE_ID"));
int src_rank = atoi(getenv("SRC_RANK"));
int dest_rank = atoi(getenv("DEST_RANK"));
VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
<< dest_rank;
std::vector<int> rank_ids = {0, 1}; VLOG(2) << "rank_id = " << rank_id
f::AttributeMap comm_init_attrs; << "; device_id = " << device_id
comm_init_attrs["ring_id"] = 0; << "; rank_id = " << rank_id
comm_init_attrs["nranks"] = 2; << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id; std::vector<int> rank_ids{0, 1};
comm_init_attrs["rank_ids"] = rank_ids; f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "CreateOp c_comm_init_hcom"; VLOG(3) << "break";
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place); comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
memcpy(hccl_id, id, 1024);
} }
void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
auto x = scope->Var("X");
auto id = x->GetMutable<HcclRootInfo>();
memcpy(id, hccl_id, 1024);
int num = atoi(getenv("DATA_SIZE"));
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank_id:" << rank_id << std::endl; int device_id = atoi(getenv("DEVICE_ID"));
ctx.Wait(); VLOG(2) << "rank_id = " << rank_id
auto place = ctx.GetPlace(); << "; device_id = " << device_id
auto out = scope->Var("Out"); << "; rank_id = " << rank_id
auto tensor_out = out->GetMutable<f::LoDTensor>(); << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
tensor_out->Resize({num, num});
tensor_out->mutable_data<float>(place); // allocate
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
}
f::AttributeMap attrs; void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
attrs["tag"] = std::string("srtest"); std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
attrs["peer"] = atoi(getenv("SRC_RANK"));
attrs["ring_id"] = 0; int num = atoi(getenv("DATA_SIZE"));
attrs["srTag"] = 0; EXPECT_GT(num, 0);
std::vector<int> out_shape; EXPECT_LT(num, 1 << 15);
out_shape.push_back(num); int rank_id = atoi(getenv("RANK_ID"));
out_shape.push_back(num); VLOG(3) << "rank_id:" << rank_id<<std::endl;
attrs["out_shape"] = out_shape;
ctx.Wait();
auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs); auto place = ctx.GetPlace();
VLOG(3) << "CreateOp recv_v2"; auto out = scope->Var("Data");
auto tensor_out = out->GetMutable<f::LoDTensor>();
for (int i = 0; i < 10; i++) { tensor_out->Resize({num, num});
op->Run(*scope, place); tensor_out->mutable_data<float>(place); // allocate
}
VLOG(3) << "Run op recv_v2"; ctx.Wait();
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); f::AttributeMap attrs;
ctx.Wait(); attrs["tag"]=std::string("srtest");
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); attrs["peer"]=atoi(getenv("SRC_RANK"));
EXPECT_EQ(out_vec == init, true); attrs["ring_id"]=0;
attrs["srTag"]=0;
std::vector<int> out_shape;
out_shape.push_back(num);
out_shape.push_back(num);
attrs["out_shape"]=out_shape;
auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs);
VLOG(3) << "CreateOp recv_v2";
for (int i = 0; i < 10; i ++) {
op->Run(*scope, place);
}
VLOG(3) << "Run op recv_v2";
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
EXPECT_EQ(out_vec == init, true);
} }
TEST(recv_v2, NPU) {
f::Scope scope; TEST(recv_v2, NPU){
char* npu_id = getenv("FLAGS_selected_npus"); f::Scope scope;
VLOG(3) << "Select npu:" << npu_id; HcclRootInfo hccl_id;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
VLOG(3) << "Place over"; char * npu_id=getenv("FLAGS_selected_npus");
Prepare(&scope, *ctx); VLOG(3) << "Select npu:" << npu_id;
VLOG(3) << "Prepare over"; p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
TestHcomRecvOp(&scope, *ctx);
VLOG(3) << "Test over"; PrepareUniqueId(&scope, ctx, &hccl_id);
Prepare(&scope, ctx, &hccl_id);
TestHcomRecvOp(&scope, ctx);
} }
...@@ -28,31 +28,37 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> { ...@@ -28,31 +28,37 @@ class CSendOpASCENDKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
auto x = ctx.Input<framework::LoDTensor>("X"); auto x = ctx.Input<framework::LoDTensor>("X");
void *ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
int numel = x->numel(); int numel = x->numel();
hcclDataType_t dtype = platform::ToHCCLDataType(x->type()); HcclDataType dtype = platform::ToHCCLDataType(x->type());
auto place = ctx.GetPlace();
int ring_id = ctx.Attr<int>("ring_id"); int ring_id = ctx.Attr<int>("ring_id");
auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); auto place = ctx.GetPlace();
auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
aclrtStream stream = nullptr; aclrtStream stream = nullptr;
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
if (ctx.Attr<bool>("use_calc_stream")) { if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream(); stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
} else { } else {
stream = comm->stream(); stream = comm->stream();
} }
std::string tag = std::to_string(ring_id) + "_" + std::to_string(comm->NextTagId());
std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
int destRank = ctx.Attr<int>("peer");
int srTag = ctx.Attr<int>("srTag");
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send( int nranks = comm->nranks();
tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), (u64)numel, dtype, destRank, int rank = comm->rank();
srTag, group.c_str(), stream));
PADDLE_ENFORCE_EQ(nranks, 2,
platform::errors::InvalidArgument(
"The nranks must be 2, but (%d)",
nranks));
int root = rank;
VLOG(3) << "begin hccl send, parameter is: "<< "root " << root
<< ", comm: " << comm->comm() << ", stream: " << stream;
VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent " PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(ptr, numel,
<< x->numel(); dtype, (uint32_t)root, comm->comm(), stream));
#else #else
PADDLE_THROW(platform::errors::PreconditionNotMet( PADDLE_THROW(platform::errors::PreconditionNotMet(
......
...@@ -30,6 +30,7 @@ limitations under the License. */ ...@@ -30,6 +30,7 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/send_v2_op.h" #include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#if defined(PADDLE_WITH_ASCEND_CL) #if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/collective_helper.h"
...@@ -41,72 +42,115 @@ namespace p = paddle::platform; ...@@ -41,72 +42,115 @@ namespace p = paddle::platform;
namespace m = paddle::operators::math; namespace m = paddle::operators::math;
USE_OP(send_v2); USE_OP(send_v2);
USE_NO_KERNEL_OP(c_comm_init_hcom); USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(send_v2, NPU); USE_OP_DEVICE_KERNEL(send_v2, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
std::string rank_table_file = getenv("RANK_TABLE_FILE"); void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID")); int device_id = atoi(getenv("DEVICE_ID"));
int src_rank = atoi(getenv("SRC_RANK"));
int dest_rank = atoi(getenv("DEST_RANK"));
VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
<< dest_rank;
std::vector<int> rank_ids = {0, 1}; VLOG(2) << "rank_id = " << rank_id
f::AttributeMap comm_init_attrs; << "; device_id = " << device_id
comm_init_attrs["ring_id"] = 0; << "; rank_id = " << rank_id
comm_init_attrs["nranks"] = 2; << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id; std::vector<int> rank_ids{0, 1};
comm_init_attrs["rank_ids"] = rank_ids; f::AttributeMap gen_hccl_id;
std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
gen_hccl_id["rank"] = rank_id;
gen_hccl_id["endpoint"] = endpointList[rank_id];
std::vector<std::string> other_endpoints = {endpointList[rank_id == 0 ? 1 : 0]};
gen_hccl_id["other_endpoints"] = other_endpoints;
auto out = scope->Var("Out");
auto id = out->GetMutable<HcclRootInfo>();
VLOG(3) << "break";
auto comm_init_op = auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); f::OpRegistry::CreateOp("c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
VLOG(3) << "break";
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place); comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
memcpy(hccl_id, id, 1024);
} }
void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { void Prepare(f::Scope* scope, const p::DeviceContext& ctx, HcclRootInfo* hccl_id){
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
auto x = scope->Var("X"); auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>(); auto id = x->GetMutable<HcclRootInfo>();
int num = atoi(getenv("DATA_SIZE"));
memcpy(id, hccl_id, 1024);
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
int rank_id = atoi(getenv("RANK_ID")); int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank id:" << rank_id; int device_id = atoi(getenv("DEVICE_ID"));
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num}); VLOG(2) << "rank_id = " << rank_id
ctx.Wait(); << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
// std::vector<int> rank_ids{0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["rank_ids"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
// comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait(); ctx.Wait();
}
f::AttributeMap attrs; void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
attrs["tag"] = std::string("srtest"); std::cout<< "BEGIN TEST:"<< __FUNCTION__ <<std::endl;
attrs["peer"] = atoi(getenv("DEST_RANK")); auto x = scope->Var("Data");
attrs["ring_id"] = 0; auto tensor_x = x->GetMutable<f::LoDTensor>();
attrs["srTag"] = 0; int num = atoi(getenv("DATA_SIZE"));;
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3)<<"rank id:"<<rank_id;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
auto place = ctx.GetPlace();
ctx.Wait();
f::AttributeMap attrs;
attrs["tag"]=std::string("srtest");
attrs["peer"]=atoi(getenv("DEST_RANK"));
attrs["ring_id"]=0;
attrs["srTag"]=0;
auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs);
for (int i = 0; i < 10; i ++) {
op->Run(*scope, place);
}
VLOG(3)<<"send run over";
ctx.Wait();
}
auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs); TEST(send_v2, NPU){
f::Scope scope;
HcclRootInfo hccl_id;
for (int i = 0; i < 10; i++) { char * npu_id=getenv("FLAGS_selected_npus");
op->Run(*scope, place); VLOG(3) << "Select npu:" << npu_id;
} p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
VLOG(3) << "send run over";
ctx.Wait();
}
TEST(send_v2, NPU) { PrepareUniqueId(&scope, ctx, &hccl_id);
f::Scope scope; Prepare(&scope, ctx, &hccl_id);
char* npu_id = getenv("FLAGS_selected_npus"); TestHcomSendOp(&scope, ctx);
VLOG(3) << "Select npu:" << npu_id;
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
VLOG(3) << "Place over";
Prepare(&scope, *ctx);
VLOG(3) << "Prepare over";
TestHcomSendOp(&scope, *ctx);
VLOG(3) << "Test over";
} }
...@@ -157,15 +157,10 @@ class HCCLComm { ...@@ -157,15 +157,10 @@ class HCCLComm {
virtual int nranks() const = 0; virtual int nranks() const = 0;
virtual int rank() const = 0; virtual int rank() const = 0;
virtual int device_id() const = 0; virtual int device_id() const = 0;
virtual HcclComm comm() const = 0;
virtual aclrtStream stream() const = 0; virtual aclrtStream stream() const = 0;
virtual NPUDeviceContext* dev_context() const = 0; virtual NPUDeviceContext* dev_context() const = 0;
virtual ~HCCLComm() = default; virtual ~HCCLComm() = default;
unsigned long NextTagId() {
return tag_counter_++;
}
private:
std::atomic<unsigned long> tag_counter_;
}; };
// A singleton HCCL communicator context reserves communication ring ids // A singleton HCCL communicator context reserves communication ring ids
...@@ -176,11 +171,12 @@ class HCCLCommContext { ...@@ -176,11 +171,12 @@ class HCCLCommContext {
return comm_ctx; return comm_ctx;
} }
HCCLComm* CreateHCCLComm(const std::vector<int>& world_rank_ids, int rank, int dev_id, int ring_id = 0); HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks,
int rank, int dev_id, int ring_id);
// a latter comm with the same dev_id and the same ring_id // a latter comm with the same dev_id and the same ring_id
// will override the former // will override the former
HCCLComm* AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id = 0); HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank,
int dev_id, int ring_id);
// retrieve a communicator by the ring id in multiprocessing mode // retrieve a communicator by the ring id in multiprocessing mode
HCCLComm* Get(int ring_id) const { HCCLComm* Get(int ring_id) const {
...@@ -217,20 +213,21 @@ class HCCLCommContext { ...@@ -217,20 +213,21 @@ class HCCLCommContext {
private: private:
// Init global hcom // Init global hcom
HCCLCommContext() { InitHcomWorldGroup(); } HCCLCommContext() {}
// we may use the group feature in the future
// HCCLCommContext() { InitHcomWorldGroup(); }
HcclComm comm_;
public: public:
~HCCLCommContext(){ ~HCCLCommContext(){ }
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy());
}
std::once_flag once_flag_; std::once_flag once_flag_;
std::mutex comm_map_mutex_; std::mutex comm_map_mutex_;
// ring id to dev-HCCLComm // ring id to dev-HCCLComm
std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_; std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;
void InitHcomWorldGroup(); // void InitHcomWorldGroup();
void ReleaseHCCLComms(); void ReleaseHCCLComms();
DISABLE_COPY_AND_ASSIGN(HCCLCommContext); DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
......
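With the 20.3 API the communicator is built from an HcclRootInfo plus explicit nranks/rank instead of a rank-id list. A hedged sketch of the call sequence a comm-init op would use against the interface above (placeholder arguments, no error handling):

    #include "paddle/fluid/platform/collective_helper.h"

    // Sketch only: building and retrieving a communicator with the new
    // HCCLCommContext interface declared above. Arguments are placeholders.
    void InitRing(HcclRootInfo* root_info, int nranks, int rank, int dev_id,
                  int ring_id) {
      // Creates (or overrides) the communicator bound to (ring_id, dev_id).
      paddle::platform::HCCLCommContext::Instance().CreateHCCLComm(
          root_info, nranks, rank, dev_id, ring_id);

      // Later, kernels look the communicator up by ring id.
      auto* comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id);
      (void)comm;  // comm->comm(), comm->stream(), comm->nranks(), ...
    }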
...@@ -34,6 +34,13 @@ class HCCLCommImpl : public HCCLComm { ...@@ -34,6 +34,13 @@ class HCCLCommImpl : public HCCLComm {
return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device;
} }
~HCCLCommImpl(){
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_));
}
void set_comm(HcclComm comm) { comm_ = comm; }
HcclComm comm() const override { return comm_; }
aclrtStream stream() const override { return dev_ctx_->stream(); } aclrtStream stream() const override { return dev_ctx_->stream(); }
void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) { void set_dev_ctx(std::unique_ptr<NPUDeviceContext>&& dev_ctx) {
...@@ -45,46 +52,43 @@ class HCCLCommImpl : public HCCLComm { ...@@ -45,46 +52,43 @@ class HCCLCommImpl : public HCCLComm {
int ring_id_; int ring_id_;
int nranks_; int nranks_;
int rank_; int rank_;
HcclComm comm_;
std::unique_ptr<NPUDeviceContext> dev_ctx_; std::unique_ptr<NPUDeviceContext> dev_ctx_;
}; };
HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector<int>& world_rank_ids, int rank, int dev_id, int ring_id) { HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks,
int rank, int dev_id, int ring_id) {
PADDLE_ENFORCE_NOT_NULL(hccl_id,
platform::errors::InvalidArgument(
"The hccl unique id should not be null."));
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
world_rank_ids.size(), 1, nranks, 1,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Expected world_rank_ids.size() > 1. But received size is %d.", world_rank_ids.size())); "Expected nranks > 1. But received nranks is %d.", nranks));
PADDLE_ENFORCE_GE(rank, 0, PADDLE_ENFORCE_GE(rank, 0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Expected rank >= 0. But received rank is %d.", rank)); "Expected rank >= 0. But received rank is %d.", rank));
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
rank, world_rank_ids.size(), rank, nranks,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Expected rank < nranks. But received rank is %d, nranks is %d.", "Expected rank < nranks. But received rank is %d, nranks is %d.",
rank, world_rank_ids.size())); rank, nranks));
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
dev_id, 0, dev_id, 0,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Expected dev_id >= 0. But received dev_id is %d.", dev_id)); "Expected dev_id >= 0. But received dev_id is %d.", dev_id));
PADDLE_ENFORCE_GE(
ring_id, 0,
platform::errors::InvalidArgument(
"Expected ring_id >= 0. But received ring_id is %d.", ring_id));
auto* comm_wrapper = AssignHCCLComm(world_rank_ids.size(), rank, dev_id, ring_id);
// HACK(sunpeng17): hcom API requires bind stream to a model HcclComm comm;
// but we don't need model in Paddle, so we feed stream pointer as model pointer PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id));
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
platform::dynload::hcom_bind_model(comm_wrapper->stream(), platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm));
comm_wrapper->stream()));
VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks << ", hccl_id: " << hccl_id << ", rank: " << rank;
// Get world_rank_ids registered in gen_nccl_id op auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id);
std::string group_name = HCOM_GROUP_PREFIX + std::to_string(ring_id);
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_create_group(
group_name.c_str(), world_rank_ids.size(), (unsigned int*)world_rank_ids.data()));
VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id
<< " has been created on device " << dev_id << ", group name: " << group_name; << " has been created on device " << dev_id << ", with comm: " << comm_wrapper->comm();
std::call_once(once_flag_, []() { std::call_once(once_flag_, []() {
std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); });
...@@ -93,7 +97,8 @@ HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector<int>& world_rank_ids ...@@ -93,7 +97,8 @@ HCCLComm* HCCLCommContext::CreateHCCLComm(const std::vector<int>& world_rank_ids
return comm_wrapper; return comm_wrapper;
} }
HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int ring_id) { HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank,
int dev_id, int ring_id) {
std::unique_ptr<NPUDeviceContext> dev_ctx( std::unique_ptr<NPUDeviceContext> dev_ctx(
new NPUDeviceContext(NPUPlace(dev_id))); new NPUDeviceContext(NPUPlace(dev_id)));
...@@ -101,6 +106,7 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int ...@@ -101,6 +106,7 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int
c->set_ring_id(ring_id); c->set_ring_id(ring_id);
c->set_nranks(nranks); c->set_nranks(nranks);
c->set_rank(rank); c->set_rank(rank);
c->set_comm(comm);
c->set_dev_ctx(std::move(dev_ctx)); c->set_dev_ctx(std::move(dev_ctx));
comm_map_mutex_.lock(); comm_map_mutex_.lock();
...@@ -112,23 +118,14 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int ...@@ -112,23 +118,14 @@ HCCLComm* HCCLCommContext::AssignHCCLComm(int nranks, int rank, int dev_id, int
dev2comm.emplace(dev_id, std::unique_ptr<HCCLComm>(c)); dev2comm.emplace(dev_id, std::unique_ptr<HCCLComm>(c));
comm_map_mutex_.unlock(); comm_map_mutex_.unlock();
return comm_map_[ring_id][dev_id].get(); if (ring_id == 0) {
} auto* dev_ctx = static_cast<platform::NPUDeviceContext*>(
platform::DeviceContextPool::Instance().Get(
void HCCLCommContext::InitHcomWorldGroup() { platform::NPUPlace(dev_id)));
const char *rank_table_file = getenv(ENV_RANK_TABLE_FILE); dev_ctx->set_hccl_comm(comm);
PADDLE_ENFORCE_NOT_NULL( }
rank_table_file,
platform::errors::InvalidArgument("The RANK_TABLE_FILE environment variable should not be null."));
const char *rank_id = getenv(ENV_RANK_ID);
PADDLE_ENFORCE_NOT_NULL(
rank_id,
platform::errors::InvalidArgument("The RANK_ID environment variable should not be null."));
PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_init(rank_table_file, rank_id)); return comm_map_[ring_id][dev_id].get();
VLOG(3) << "Successfully initialized hcom. rank_table_file: "
<< rank_table_file << ", rank_id " << rank_id;
} }
void HCCLCommContext::ReleaseHCCLComms() { void HCCLCommContext::ReleaseHCCLComms() {
......
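A minimal sketch of the root-info based initialization that the hunk above switches to, in place of the old rank-table/hcom flow. This is not the exact Paddle call chain: nranks, rank and dev_id are placeholders, the Paddle header paths are assumptions, and only the HCCL symbols that appear in the dynload table later in this diff are used.

#include <hccl/hccl.h>
#include <hccl/hccl_types.h>
#include "acl/acl.h"  // aclrtSetDevice
// Paddle-internal headers (paths assumed): PADDLE_ENFORCE_NPU_SUCCESS and the
// dynamically loaded HCCL wrappers declared further down in this diff.
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/dynload/hccl.h"

void InitHcclCommSketch(HcclRootInfo* hccl_id, int nranks, int rank,
                        int dev_id) {
  HcclComm comm;
  // Bind the calling thread to the target NPU before creating the comm.
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id));
  // Every rank passes the same root info (generated on the root rank and
  // exchanged by the c_gen_hccl_id op) together with its own rank id.
  PADDLE_ENFORCE_NPU_SUCCESS(paddle::platform::dynload::HcclCommInitRootInfo(
      nranks, hccl_id, rank, &comm));
  // ... collective calls such as HcclAllReduce would go here ...
  PADDLE_ENFORCE_NPU_SUCCESS(paddle::platform::dynload::HcclCommDestroy(comm));
}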
...@@ -185,11 +185,21 @@ class NPUDeviceContext : public DeviceContext { ...@@ -185,11 +185,21 @@ class NPUDeviceContext : public DeviceContext {
void WaitStreamCallback() const { return stream_->WaitCallback(); } void WaitStreamCallback() const { return stream_->WaitCallback(); }
#if defined(PADDLE_WITH_ASCEND_CL)
/*! \brief Return hccl communicators. */
HcclComm hccl_comm() const { return hccl_comm_; }
/*! \brief Set hccl communicators. */
void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; }
#endif
private: private:
NPUPlace place_; NPUPlace place_;
aclrtContext context_; aclrtContext context_;
#ifdef PADDLE_WITH_ASCEND_HCCL
HCCLContext_t hccl_context_; #ifdef PADDLE_WITH_ASCEND_CL
// HCCLContext_t hccl_context_;
HcclComm hccl_comm_{nullptr};
#endif #endif
// Need to be the same with other DeviceContext, // Need to be the same with other DeviceContext,
......
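With the accessor added above, a kernel can pull the cached communicator for the default ring straight from the device context pool. A hedged sketch, assuming ring 0 has already been initialized and dev_id is the local NPU id (the helper name is illustrative only):

HcclComm GetDefaultRingCommSketch(int dev_id) {
  // Mirrors the set_hccl_comm() call made for ring_id == 0 in
  // HCCLCommContext::AssignHCCLComm earlier in this diff.
  auto* npu_ctx = static_cast<paddle::platform::NPUDeviceContext*>(
      paddle::platform::DeviceContextPool::Instance().Get(
          paddle::platform::NPUPlace(dev_id)));
  // Returns nullptr until the ring-0 communicator has been assigned.
  return npu_ctx->hccl_comm();
}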
...@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and ...@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
// #include <hccl/hccl.h> #include <hccl/hccl.h>
// #include <hccl/hccl_types.h> #include <hccl/hccl_types.h>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/dynload/hcom.h" // #include "paddle/fluid/platform/dynload/hcom.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h"
#define HCOM_GROUP_PREFIX "HCOM_GROUP_"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
namespace dynload { namespace dynload {
...@@ -43,27 +45,14 @@ extern void* hccl_dso_handle; ...@@ -43,27 +45,14 @@ extern void* hccl_dso_handle;
extern DynLoad__##__name __name extern DynLoad__##__name __name
#define HCCL_RAND_ROUTINE_EACH(__macro) \ #define HCCL_RAND_ROUTINE_EACH(__macro) \
__macro(hcom_init); \ __macro(HcclReduceScatter); \
__macro(hcom_destroy); \ __macro(HcclCommDestroy); \
__macro(hcom_bind_model); \ __macro(HcclAllReduce); \
__macro(hcom_unbind_model); \ __macro(HcclCommInitRootInfo); \
__macro(hcom_send); \ __macro(HcclGetRootInfo); \
__macro(hcom_receive); \ __macro(HcclBroadcast); \
__macro(hcom_broadcast); \ __macro(HcclCommInitClusterInfo); \
__macro(hcom_all_gather); \ __macro(HcclAllGather);
__macro(hcom_all_reduce); \
__macro(hcom_reduce_scatter); \
__macro(hcom_create_group); \
__macro(hcom_destroy_group); \
__macro(hcom_get_rank_id); \
__macro(hcom_get_local_rank_id); \
__macro(hcom_get_local_rank_size); \
__macro(hcom_get_split_strategy); \
__macro(hcom_set_split_strategy_by_size); \
__macro(hcom_set_split_strategy_by_index); \
__macro(hcom_get_group_rank_from_world_rank); \
__macro(hcom_get_world_rank_from_group_rank);
HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP)
......
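Each name listed in HCCL_RAND_ROUTINE_EACH expands into a lazily loaded wrapper with the same signature as the corresponding CANN symbol, so call sites spell the plain HCCL API through the dynload namespace. A hedged sketch of what the root rank of c_gen_hccl_id effectively does (function name is illustrative, Paddle headers assumed to be included):

void GenRootInfoSketch() {
  // HcclRootInfo is the unique-id struct registered in var_type_traits above.
  HcclRootInfo root_info;
  // Produce the id that the other endpoints receive before they call
  // HcclCommInitRootInfo with the same value.
  PADDLE_ENFORCE_NPU_SUCCESS(
      paddle::platform::dynload::HcclGetRootInfo(&root_info));
}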
...@@ -40,7 +40,7 @@ limitations under the License. */ ...@@ -40,7 +40,7 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
#include "acl/acl.h" #include "acl/acl.h"
#include "paddle/fluid/platform/dynload/hcom.h" #include "hccl/hccl_types.h"
#endif // PADDLE_WITH_ASCEND_CL #endif // PADDLE_WITH_ASCEND_CL
#include <fstream> #include <fstream>
...@@ -1013,7 +1013,7 @@ struct NPUStatusType {}; ...@@ -1013,7 +1013,7 @@ struct NPUStatusType {};
} }
DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE);
DEFINE_NPU_STATUS_TYPE(hcclResult_t, HCCL_SUCCESS); DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS);
} // namespace details } // namespace details
inline std::string build_npu_error_msg(aclError stat) { inline std::string build_npu_error_msg(aclError stat) {
...@@ -1022,7 +1022,7 @@ inline std::string build_npu_error_msg(aclError stat) { ...@@ -1022,7 +1022,7 @@ inline std::string build_npu_error_msg(aclError stat) {
return sout.str(); return sout.str();
} }
inline std::string build_npu_error_msg(hcclResult_t stat) { inline std::string build_npu_error_msg(HcclResult stat) {
std::ostringstream sout; std::ostringstream sout;
sout << " HCCL error, the error code is : " << stat << ". "; sout << " HCCL error, the error code is : " << stat << ". ";
return sout.str(); return sout.str();
......
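Registering HcclResult with HCCL_SUCCESS is what lets the same PADDLE_ENFORCE_NPU_SUCCESS macro cover both ACL and HCCL return codes. A hedged sketch of the equivalent manual check, with comm assumed to be an already initialized communicator:

void CheckedDestroySketch(HcclComm comm) {
  HcclResult ret = paddle::platform::dynload::HcclCommDestroy(comm);
  if (ret != HCCL_SUCCESS) {
    // Uses the build_npu_error_msg(HcclResult) overload added in this hunk.
    VLOG(0) << paddle::platform::build_npu_error_msg(ret);
  }
}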
This diff is collapsed.
...@@ -74,11 +74,11 @@ def _get_ascend_rankfile(rank_table_file_path): ...@@ -74,11 +74,11 @@ def _get_ascend_rankfile(rank_table_file_path):
node_ips.append(server['server_id']) node_ips.append(server['server_id'])
device_list = server['device'] device_list = server['device']
device_count = len(device_list) device_count = len(device_list)
return node_ips, device_count return node_ips, device_count
def get_cloud_cluster(rank_table_file=None, def get_cloud_cluster(rank_table_file=None,
device_mode=DeviceMode.ASCEND_NPU, device_mode=DeviceMode.ASCEND_NPU,
start_port=6070): start_port=6070):
""" """
Args: Args:
...@@ -86,7 +86,7 @@ def get_cloud_cluster(rank_table_file=None, ...@@ -86,7 +86,7 @@ def get_cloud_cluster(rank_table_file=None,
device_mode: DeviceMode(Int) device_mode: DeviceMode(Int)
start_port: the start port of current runtime env start_port: the start port of current runtime env
""" """
if rank_table_file: if rank_table_file:
# multi trainers # multi trainers
node_ips, device_count = _get_ascend_rankfile(rank_table_file) node_ips, device_count = _get_ascend_rankfile(rank_table_file)
if len(node_ips) == 1: if len(node_ips) == 1:
...@@ -106,7 +106,7 @@ def get_cloud_cluster(rank_table_file=None, ...@@ -106,7 +106,7 @@ def get_cloud_cluster(rank_table_file=None,
node_ips = ["127.0.0.1"] node_ips = ["127.0.0.1"]
node_ip = node_ips[0] node_ip = node_ips[0]
device_count = 1 device_count = 1
devices_per_proc = [str(x) for x in range(device_count)] devices_per_proc = [str(x) for x in range(device_count)]
free_ports = [ free_ports = [
x for x in range(start_port, start_port + len(devices_per_proc)) x for x in range(start_port, start_port + len(devices_per_proc))
...@@ -117,4 +117,4 @@ def get_cloud_cluster(rank_table_file=None, ...@@ -117,4 +117,4 @@ def get_cloud_cluster(rank_table_file=None,
trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports]) trainer_endpoints.append(["%s:%d" % (ip, port) for port in free_ports])
return get_cluster(node_ips, node_ip, trainer_endpoints, return get_cluster(node_ips, node_ip, trainer_endpoints,
device_mode, devices_per_proc) device_mode, devices_per_proc)
\ No newline at end of file
...@@ -151,17 +151,30 @@ class CollectiveHelper(object): ...@@ -151,17 +151,30 @@ class CollectiveHelper(object):
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward
}) })
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)} endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op( block.append_op(
type='c_comm_init_hcom', type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': ring_id, 'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': [endpoint_to_index_map[e] for e in endpoints], 'rank_ids': nranks,
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward
}) })
......
...@@ -108,19 +108,32 @@ class PipelineHelper(object): ...@@ -108,19 +108,32 @@ class PipelineHelper(object):
OP_ROLE_KEY: OpRole.Forward, OP_ROLE_KEY: OpRole.Forward,
}) })
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
endpoint_to_index_map = { if rank == 0 and wait_port:
e: idx for idx, e in enumerate(endpoints) wait_server_ready(other_endpoints)
} hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op( block.append_op(
type='c_comm_init_hcom', type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': ring_id, 'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': [endpoint_to_index_map[e] for e in endpoints], 'rank_ids': nranks,
OP_ROLE_KEY: OpRole.Forward OP_ROLE_KEY: OpRole.Forward
}) })
......
...@@ -318,12 +318,12 @@ def _set_expected_place(place): ...@@ -318,12 +318,12 @@ def _set_expected_place(place):
# TODO(zhiqiu): remove this function. # TODO(zhiqiu): remove this function.
def _var_base_to_np(var_base): def _var_base_to_np(var_base):
""" """
convert VarBase to numpy convert VarBase to numpy
Args: Args:
var_base(VarBase) : the VarBase to convert var_base(VarBase) : the VarBase to convert
Returns (np.ndarray): the np.ndarray contains the value of VarBase Returns (np.ndarray): the np.ndarray contains the value of VarBase
""" """
warnings.warn( warnings.warn(
...@@ -413,7 +413,7 @@ def cuda_places(device_ids=None): ...@@ -413,7 +413,7 @@ def cuda_places(device_ids=None):
If :code:`device_ids` is not None, it should be the device If :code:`device_ids` is not None, it should be the device
ids of GPUs. For example, if :code:`device_ids=[0,1,2]`, ids of GPUs. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be the returned list would be
[paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)].
Parameters: Parameters:
...@@ -455,19 +455,19 @@ def xpu_places(device_ids=None): ...@@ -455,19 +455,19 @@ def xpu_places(device_ids=None):
xpu places would be returned. xpu places would be returned.
If :code:`device_ids` is not None, it should be the device If :code:`device_ids` is not None, it should be the device
ids of XPUs. For example, if :code:`device_ids=[0,1,2]`, ids of XPUs. For example, if :code:`device_ids=[0,1,2]`,
the returned list would be the returned list would be
[paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)]. [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)].
Parameters: Parameters:
device_ids (list or tuple of int, optional): list of XPU device ids. device_ids (list or tuple of int, optional): list of XPU device ids.
Returns: Returns:
list of paddle.XPUPlace: Created XPU place list. list of paddle.XPUPlace: Created XPU place list.
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.static as static import paddle.static as static
paddle.enable_static() paddle.enable_static()
xpu_places = static.xpu_places() xpu_places = static.xpu_places()
""" """
...@@ -485,7 +485,7 @@ def cpu_places(device_count=None): ...@@ -485,7 +485,7 @@ def cpu_places(device_count=None):
This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list.
If :code:`device_count` is None, the device count would If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`. be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the default value is 1, If :code:`CPU_NUM` is not set, the default value is 1,
i.e. CPU_NUM=1. i.e. CPU_NUM=1.
:code:`CPU_NUM` indicates the number of devices used in the current task. :code:`CPU_NUM` indicates the number of devices used in the current task.
...@@ -518,7 +518,7 @@ def cuda_pinned_places(device_count=None): ...@@ -518,7 +518,7 @@ def cuda_pinned_places(device_count=None):
This function creates a list of :code:`fluid.CUDAPinnedPlace` objects. This function creates a list of :code:`fluid.CUDAPinnedPlace` objects.
If :code:`device_count` is None, the device count would If :code:`device_count` is None, the device count would
be determined by environment variable :code:`CPU_NUM`. be determined by environment variable :code:`CPU_NUM`.
If :code:`CPU_NUM` is not set, the default value is 1, If :code:`CPU_NUM` is not set, the default value is 1,
i.e. CPU_NUM=1. i.e. CPU_NUM=1.
:code:`CPU_NUM` indicates the number of devices used in the current task. :code:`CPU_NUM` indicates the number of devices used in the current task.
...@@ -579,7 +579,7 @@ def name_scope(prefix=None): ...@@ -579,7 +579,7 @@ def name_scope(prefix=None):
Generate hierarchical name prefix for the operators in Static Graph. Generate hierarchical name prefix for the operators in Static Graph.
Note: Note:
This should only be used for debugging and visualization purposes. This should only be used for debugging and visualization purposes.
Don't use it for serious analysis such as graph/program transformations. Don't use it for serious analysis such as graph/program transformations.
Don't use it in dygraph, since it will cause memory leak. Don't use it in dygraph, since it will cause memory leak.
...@@ -604,7 +604,7 @@ def name_scope(prefix=None): ...@@ -604,7 +604,7 @@ def name_scope(prefix=None):
with paddle.static.name_scope("s4"): with paddle.static.name_scope("s4"):
g = f - 1 g = f - 1
# Op are created in the default main program. # Op are created in the default main program.
for op in paddle.static.default_main_program().block(0).ops: for op in paddle.static.default_main_program().block(0).ops:
# elementwise_add is created in /s1/ # elementwise_add is created in /s1/
if op.type == 'elementwise_add': if op.type == 'elementwise_add':
...@@ -1634,7 +1634,7 @@ class Variable(object): ...@@ -1634,7 +1634,7 @@ class Variable(object):
def clone(self): def clone(self):
""" """
Returns a new static Variable, which is the clone of the original static Returns a new static Variable, which is the clone of the original static
Variable. It remains in the current graph, that is, the cloned Variable Variable. It remains in the current graph, that is, the cloned Variable
provides gradient propagation. Calling ``out = tensor.clone()`` is same provides gradient propagation. Calling ``out = tensor.clone()`` is same
as ``out = assign(tensor)`` . as ``out = assign(tensor)`` .
...@@ -1685,7 +1685,7 @@ class Variable(object): ...@@ -1685,7 +1685,7 @@ class Variable(object):
key(str): Key for this information. key(str): Key for this information.
value(object): The value associated to the key. value(object): The value associated to the key.
Returns: Returns:
None None
""" """
if not hasattr(self, "_info"): if not hasattr(self, "_info"):
...@@ -1699,7 +1699,7 @@ class Variable(object): ...@@ -1699,7 +1699,7 @@ class Variable(object):
Args: Args:
key(str): Key for this information. key(str): Key for this information.
Returns: Returns:
object object
""" """
if hasattr(self, "_info") and key in self._info: if hasattr(self, "_info") and key in self._info:
...@@ -2053,7 +2053,7 @@ class Operator(object): ...@@ -2053,7 +2053,7 @@ class Operator(object):
'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad', 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad',
'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv',
'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify', 'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify',
'gen_nccl_id', 'c_gen_nccl_id', 'c_comm_init', 'c_comm_init_hcom', 'c_sync_calc_stream', 'gen_nccl_id', 'c_gen_nccl_id', 'c_gen_hccl_id', 'c_comm_init', 'c_comm_init_hccl', 'c_sync_calc_stream',
'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue', 'c_sync_comm_stream', 'queue_generator', 'dequeue', 'enqueue',
'heter_listen_and_serv' 'heter_listen_and_serv'
} }
...@@ -3043,7 +3043,7 @@ class Block(object): ...@@ -3043,7 +3043,7 @@ class Block(object):
def _insert_op_without_sync(self, index, *args, **kwargs): def _insert_op_without_sync(self, index, *args, **kwargs):
""" """
Insert an Operator according to the giving arguments, Insert an Operator according to the giving arguments,
without sync_with_cpp to make the compilation faster. without sync_with_cpp to make the compilation faster.
Args: Args:
...@@ -4455,8 +4455,8 @@ class Program(object): ...@@ -4455,8 +4455,8 @@ class Program(object):
def clone(self, for_test=False): def clone(self, for_test=False):
""" """
.. note:: .. note::
1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . 1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` .
2. Recommend you to use :code:`clone` before using :code:`Optimizer.minimize` . 2. Recommend you to use :code:`clone` before using :code:`Optimizer.minimize` .
3. This API has no effect in Dygraph Mode. 3. This API has no effect in Dygraph Mode.
Create a new Program with forward content of original one when ``for_test=True``. Create a new Program with forward content of original one when ``for_test=True``.
...@@ -4684,8 +4684,8 @@ class Program(object): ...@@ -4684,8 +4684,8 @@ class Program(object):
def _prune_with_input(self, feeded_var_names, targets): def _prune_with_input(self, feeded_var_names, targets):
""" """
Prune operators and variables which are not needed to generate Prune operators and variables which are not needed to generate
:code:`targets`. Prune operators and variables which are needed :code:`targets`. Prune operators and variables which are needed
to generate feeded_var to generate feeded_var
Notes: This is a very low level API. Users should not use this API Notes: This is a very low level API. Users should not use this API
directly. This API is in flux and not stable. directly. This API is in flux and not stable.
...@@ -4830,7 +4830,7 @@ class Program(object): ...@@ -4830,7 +4830,7 @@ class Program(object):
def parse_from_string(binary_str): def parse_from_string(binary_str):
""" """
.. note:: .. note::
1. All information about parameters will be lost after serialization; 1. All information about parameters will be lost after serialization;
2. This API has no effect in Dygraph mode. 2. This API has no effect in Dygraph mode.
Deserialize a Program from `protobuf <https://en.wikipedia.org/wiki/Protocol_Buffers>`_ binary string. Deserialize a Program from `protobuf <https://en.wikipedia.org/wiki/Protocol_Buffers>`_ binary string.
...@@ -4895,7 +4895,7 @@ class Program(object): ...@@ -4895,7 +4895,7 @@ class Program(object):
The default random seed for random operators in Program. ``0`` means get The default random seed for random operators in Program. ``0`` means get
the random seed from random device. the random seed from random device.
.. note:: .. note::
It must be set before the operators have been added. It must be set before the operators have been added.
Returns: Returns:
...@@ -4933,7 +4933,7 @@ class Program(object): ...@@ -4933,7 +4933,7 @@ class Program(object):
""" """
The number of :ref:`api_guide_Block_en` in this Program. The number of :ref:`api_guide_Block_en` in this Program.
.. note:: .. note::
This API has no effect in Dygraph mode. This API has no effect in Dygraph mode.
Returns: Returns:
...@@ -5141,8 +5141,8 @@ class Program(object): ...@@ -5141,8 +5141,8 @@ class Program(object):
Args: Args:
other(Program): Other program other(Program): Other program
pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program
self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is
cloned from block 0 in other, etc. Default is None, which means default mapped, cloned from block 0 in other, etc. Default is None, which means default mapped,
{0:0, 1:1,..., n:n}. {0:0, 1:1,..., n:n}.
Returns: Returns:
...@@ -5261,7 +5261,7 @@ class Parameter(Variable): ...@@ -5261,7 +5261,7 @@ class Parameter(Variable):
be applied on the parameter. Default: None be applied on the parameter. Default: None
do_model_average(bool): True if the model average strategy will do_model_average(bool): True if the model average strategy will
be applied on this parameter. be applied on this parameter.
need_clip (bool): Whether the parameter gradient needs to be clipped need_clip (bool): Whether the parameter gradient needs to be clipped
in optimizer. Default is True. in optimizer. Default is True.
""" """
...@@ -5349,8 +5349,8 @@ class Parameter(Variable): ...@@ -5349,8 +5349,8 @@ class Parameter(Variable):
class ParamBase(core.VarBase): class ParamBase(core.VarBase):
""" """
ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). ParamBase is derived from Tensor( Which is the concept in Dygraph Mode).
A ParamBase is a persistable Tensor, and will be updated by optimizers A ParamBase is a persistable Tensor, and will be updated by optimizers
after each iteration. after each iteration.
The training of a neural network is essentially the updating of The training of a neural network is essentially the updating of
its ParamBase. its ParamBase.
...@@ -5368,7 +5368,7 @@ class ParamBase(core.VarBase): ...@@ -5368,7 +5368,7 @@ class ParamBase(core.VarBase):
be applied on the ParamBase. Default: None be applied on the ParamBase. Default: None
do_model_average(bool): True if the model average strategy will do_model_average(bool): True if the model average strategy will
be applied on this ParamBase. be applied on this ParamBase.
need_clip (bool): Whether the parameter gradient needs to be clipped need_clip (bool): Whether the parameter gradient needs to be clipped
in optimizer. Default is True. in optimizer. Default is True.
""" """
...@@ -5491,7 +5491,7 @@ def default_startup_program(): ...@@ -5491,7 +5491,7 @@ def default_startup_program():
Get default/global startup program. Get default/global startup program.
The :code:`paddle.nn` function will append the initialization operators into startup program. The :code:`paddle.nn` function will append the initialization operators into startup program.
The :code:`startup_program` will initialize the parameters by the OPs. The :code:`startup_program` will initialize the parameters by the OPs.
This method will return the default or the current startup program. Users can use This method will return the default or the current startup program. Users can use
:ref:`api_paddle_fluid_framework_program_guard` to switch :ref:`api_paddle_fluid_framework_Program` . :ref:`api_paddle_fluid_framework_program_guard` to switch :ref:`api_paddle_fluid_framework_Program` .
...@@ -5499,7 +5499,7 @@ def default_startup_program(): ...@@ -5499,7 +5499,7 @@ def default_startup_program():
Returns: Returns:
Program: current default startup program. Program: current default startup program.
Returns type: Returns type:
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -5517,13 +5517,13 @@ def default_startup_program(): ...@@ -5517,13 +5517,13 @@ def default_startup_program():
def default_main_program(): def default_main_program():
""" """
This API can be used to get ``default main program`` which store the This API can be used to get ``default main program`` which store the
descriptions of Ops and tensors. descriptions of Ops and tensors.
For example ``z = paddle.add(x, y)`` will create a new ``add`` For example ``z = paddle.add(x, y)`` will create a new ``add``
Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . Op and a new ``z`` tensor, and they will be recorded in ``default main program`` .
The ``default main program`` is the default value for ``Program`` parameter in The ``default main program`` is the default value for ``Program`` parameter in
a lot of APIs. For example, the :code:`Executor.run()` will execute the a lot of APIs. For example, the :code:`Executor.run()` will execute the
:code:`default_main_program` when the program is not specified. :code:`default_main_program` when the program is not specified.
...@@ -5593,8 +5593,8 @@ def program_guard(main_program, startup_program=None): ...@@ -5593,8 +5593,8 @@ def program_guard(main_program, startup_program=None):
Args: Args:
main_program(Program): New main program inside ``with`` statement. main_program(Program): New main program inside ``with`` statement.
startup_program(Program, optional): New startup program inside ``with`` startup_program(Program, optional): New startup program inside ``with``
statement. :code:`None` means not changing startup program, statement. :code:`None` means not changing startup program,
default_startup_program is still used. default_startup_program is still used.
Default: None. Default: None.
......
...@@ -131,19 +131,32 @@ class Collective(object): ...@@ -131,19 +131,32 @@ class Collective(object):
self.op_role_key: OpRole.Forward self.op_role_key: OpRole.Forward
}) })
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
endpoint_to_index_map = { if rank == 0 and wait_port:
e: idx for idx, e in enumerate(endpoints) wait_server_ready(other_endpoints)
} hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op( block.append_op(
type='c_comm_init_hcom', type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
self.op_role_key: OpRole.Forward
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': ring_id, 'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': [endpoint_to_index_map[e] for e in endpoints], 'rank_ids': nranks,
self.op_role_key: OpRole.Forward self.op_role_key: OpRole.Forward
}) })
......
...@@ -162,19 +162,33 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, ...@@ -162,19 +162,33 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint,
'ring_id': 0, 'ring_id': 0,
}) })
elif core.is_compiled_with_npu(): elif core.is_compiled_with_npu():
endpoint_to_index_map = { if rank == 0 and wait_port:
e: idx for idx, e in enumerate(endpoints) wait_server_ready(other_endpoints)
} hccl_id_var = block.create_var(
name=unique_name.generate('hccl_id'),
persistable=True,
type=core.VarDesc.VarType.RAW)
endpoint_to_index_map = {e: idx for idx, e in enumerate(endpoints)}
block.append_op( block.append_op(
type='c_comm_init_hcom', type='c_gen_hccl_id',
inputs={}, inputs={},
outputs={'Out': hccl_id_var},
attrs={
'rank': rank,
'endpoint': current_endpoint,
'other_endpoints': other_endpoints,
OP_ROLE_KEY: OpRole.Forward
})
block.append_op(
type='c_comm_init_hccl',
inputs={'X': hccl_id_var},
outputs={}, outputs={},
attrs={ attrs={
'nranks': nranks,
'rank': rank, 'rank': rank,
'ring_id': 0, 'ring_id': ring_id,
'device_id': int(os.getenv("FLAGS_selected_npus")), 'device_id': int(os.getenv("FLAGS_selected_npus")),
'rank_ids': [endpoint_to_index_map[e] for e in endpoints], 'rank_ids': nranks,
OP_ROLE_KEY: OpRole.Forward
}) })
...@@ -852,26 +866,26 @@ class Model(object): ...@@ -852,26 +866,26 @@ class Model(object):
import paddle.nn as nn import paddle.nn as nn
import paddle.vision.transforms as T import paddle.vision.transforms as T
from paddle.static import InputSpec from paddle.static import InputSpec
device = paddle.set_device('cpu') # or 'gpu' device = paddle.set_device('cpu') # or 'gpu'
net = nn.Sequential( net = nn.Sequential(
nn.Flatten(1), nn.Flatten(1),
nn.Linear(784, 200), nn.Linear(784, 200),
nn.Tanh(), nn.Tanh(),
nn.Linear(200, 10)) nn.Linear(200, 10))
# inputs and labels are not required for dynamic graph. # inputs and labels are not required for dynamic graph.
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(net, input, label) model = paddle.Model(net, input, label)
optim = paddle.optimizer.SGD(learning_rate=1e-3, optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters()) parameters=model.parameters())
model.prepare(optim, model.prepare(optim,
paddle.nn.CrossEntropyLoss(), paddle.nn.CrossEntropyLoss(),
paddle.metric.Accuracy()) paddle.metric.Accuracy())
transform = T.Compose([ transform = T.Compose([
T.Transpose(), T.Transpose(),
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
...@@ -914,12 +928,12 @@ class Model(object): ...@@ -914,12 +928,12 @@ class Model(object):
Run one training step on a batch of data. Run one training step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list): Batch of labels. It could be labels (numpy.ndarray|Tensor|list): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If has no labels, (in case the model has multiple labels). If has no labels,
set None. Default is None. set None. Default is None.
Returns: Returns:
...@@ -930,7 +944,7 @@ class Model(object): ...@@ -930,7 +944,7 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
...@@ -965,12 +979,12 @@ class Model(object): ...@@ -965,12 +979,12 @@ class Model(object):
Run one evaluating step on a batch of data. Run one evaluating step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
labels (numpy.ndarray|Tensor|list): Batch of labels. It could be labels (numpy.ndarray|Tensor|list): Batch of labels. It could be
a numpy array or paddle.Tensor, or a list of arrays or tensors a numpy array or paddle.Tensor, or a list of arrays or tensors
(in case the model has multiple labels). If it has no labels, (in case the model has multiple labels). If it has no labels,
set None. Default is None. set None. Default is None.
Returns: Returns:
...@@ -981,7 +995,7 @@ class Model(object): ...@@ -981,7 +995,7 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
...@@ -1017,8 +1031,8 @@ class Model(object): ...@@ -1017,8 +1031,8 @@ class Model(object):
Run one predicting step on a batch of data. Run one predicting step on a batch of data.
Args: Args:
inputs (numpy.ndarray|Tensor|list): Batch of input data. It could inputs (numpy.ndarray|Tensor|list): Batch of input data. It could
be a numpy array or paddle.Tensor, or a list of arrays or be a numpy array or paddle.Tensor, or a list of arrays or
tensors (in case the model has multiple inputs). tensors (in case the model has multiple inputs).
Returns: Returns:
...@@ -1028,14 +1042,14 @@ class Model(object): ...@@ -1028,14 +1042,14 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import numpy as np import numpy as np
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
device = paddle.set_device('cpu') # or 'gpu' device = paddle.set_device('cpu') # or 'gpu'
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
...@@ -1057,12 +1071,12 @@ class Model(object): ...@@ -1057,12 +1071,12 @@ class Model(object):
return loss return loss
def save(self, path, training=True): def save(self, path, training=True):
""" """
This function saves parameters, optimizer information or model and This function saves parameters, optimizer information or model and
parameters only for inference to path. It depends on the parameter parameters only for inference to path. It depends on the parameter
`training`. `training`.
If `training` is set to True, the parameters saved contain all If `training` is set to True, the parameters saved contain all
the trainable Variable, will save to a file with suffix ".pdparams". the trainable Variable, will save to a file with suffix ".pdparams".
The optimizer information contains all the variable used by optimizer. The optimizer information contains all the variable used by optimizer.
For Adam optimizer, contains beta1, beta2, momentum etc. All the For Adam optimizer, contains beta1, beta2, momentum etc. All the
...@@ -1115,13 +1129,13 @@ class Model(object): ...@@ -1115,13 +1129,13 @@ class Model(object):
optim = paddle.optimizer.SGD(learning_rate=1e-3, optim = paddle.optimizer.SGD(learning_rate=1e-3,
parameters=model.parameters()) parameters=model.parameters())
model.prepare(optim, paddle.nn.CrossEntropyLoss()) model.prepare(optim, paddle.nn.CrossEntropyLoss())
transform = T.Compose([ transform = T.Compose([
T.Transpose(), T.Transpose(),
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
]) ])
data = paddle.vision.datasets.MNIST(mode='train', transform=transform) data = paddle.vision.datasets.MNIST(mode='train', transform=transform)
model.fit(data, epochs=1, batch_size=32, verbose=0) model.fit(data, epochs=1, batch_size=32, verbose=0)
model.save('checkpoint/test') # save for training model.save('checkpoint/test') # save for training
model.save('inference_model', False) # save for inference model.save('inference_model', False) # save for inference
...@@ -1165,7 +1179,7 @@ class Model(object): ...@@ -1165,7 +1179,7 @@ class Model(object):
Examples: Examples:
.. code-block:: python .. code-block:: python
import paddle import paddle
import paddle.nn as nn import paddle.nn as nn
from paddle.static import InputSpec from paddle.static import InputSpec
...@@ -1247,7 +1261,7 @@ class Model(object): ...@@ -1247,7 +1261,7 @@ class Model(object):
from paddle.static import InputSpec from paddle.static import InputSpec
input = InputSpec([None, 784], 'float32', 'x') input = InputSpec([None, 784], 'float32', 'x')
model = paddle.Model(nn.Sequential( model = paddle.Model(nn.Sequential(
nn.Linear(784, 200), nn.Linear(784, 200),
nn.Tanh(), nn.Tanh(),
...@@ -1332,12 +1346,12 @@ class Model(object): ...@@ -1332,12 +1346,12 @@ class Model(object):
evaluation will be done at the end of each epoch. evaluation will be done at the end of each epoch.
Args: Args:
train_data (Dataset|DataLoader): An iterable data loader is used for train_data (Dataset|DataLoader): An iterable data loader is used for
train. An instance of paddle paddle.io.Dataset or train. An instance of paddle paddle.io.Dataset or
paddle.io.Dataloader is recommended. Default: None. paddle.io.Dataloader is recommended. Default: None.
eval_data (Dataset|DataLoader): An iterable data loader is used for eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation at the end of epoch. If None, will not do evaluation. evaluation at the end of epoch. If None, will not do evaluation.
An instance of paddle.io.Dataset or paddle.io.Dataloader An instance of paddle.io.Dataset or paddle.io.Dataloader
is recommended. Default: None. is recommended. Default: None.
batch_size (int): Integer number. The batch size of train_data batch_size (int): Integer number. The batch size of train_data
and eval_data. When train_data and eval_data are both the and eval_data. When train_data and eval_data are both the
...@@ -1394,10 +1408,10 @@ class Model(object): ...@@ -1394,10 +1408,10 @@ class Model(object):
]) ])
train_dataset = MNIST(mode='train', transform=transform) train_dataset = MNIST(mode='train', transform=transform)
val_dataset = MNIST(mode='test', transform=transform) val_dataset = MNIST(mode='test', transform=transform)
input = InputSpec([None, 1, 28, 28], 'float32', 'image') input = InputSpec([None, 1, 28, 28], 'float32', 'image')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model( model = paddle.Model(
paddle.vision.models.LeNet(), paddle.vision.models.LeNet(),
input, label) input, label)
...@@ -1426,7 +1440,7 @@ class Model(object): ...@@ -1426,7 +1440,7 @@ class Model(object):
dynamic = True dynamic = True
if not dynamic: if not dynamic:
paddle.enable_static() paddle.enable_static()
transform = T.Compose([ transform = T.Compose([
T.Transpose(), T.Transpose(),
T.Normalize([127.5], [127.5]) T.Normalize([127.5], [127.5])
...@@ -1437,10 +1451,10 @@ class Model(object): ...@@ -1437,10 +1451,10 @@ class Model(object):
val_dataset = MNIST(mode='test', transform=transform) val_dataset = MNIST(mode='test', transform=transform)
val_loader = paddle.io.DataLoader(val_dataset, val_loader = paddle.io.DataLoader(val_dataset,
batch_size=64) batch_size=64)
input = InputSpec([None, 1, 28, 28], 'float32', 'image') input = InputSpec([None, 1, 28, 28], 'float32', 'image')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model( model = paddle.Model(
paddle.vision.models.LeNet(), input, label) paddle.vision.models.LeNet(), input, label)
optim = paddle.optimizer.Adam( optim = paddle.optimizer.Adam(
...@@ -1541,7 +1555,7 @@ class Model(object): ...@@ -1541,7 +1555,7 @@ class Model(object):
Args: Args:
eval_data (Dataset|DataLoader): An iterable data loader is used for eval_data (Dataset|DataLoader): An iterable data loader is used for
evaluation. An instance of paddle.io.Dataset or evaluation. An instance of paddle.io.Dataset or
paddle.io.Dataloader is recommended. paddle.io.Dataloader is recommended.
batch_size (int): Integer number. The batch size of train_data batch_size (int): Integer number. The batch size of train_data
and eval_data. When eval_data is the instance of Dataloader, and eval_data. When eval_data is the instance of Dataloader,
...@@ -1638,7 +1652,7 @@ class Model(object): ...@@ -1638,7 +1652,7 @@ class Model(object):
batch_size (int): Integer number. The batch size of train_data and eval_data. batch_size (int): Integer number. The batch size of train_data and eval_data.
When train_data and eval_data are both the instance of Dataloader, this When train_data and eval_data are both the instance of Dataloader, this
argument will be ignored. Default: 1. argument will be ignored. Default: 1.
num_workers (int): The number of subprocess to load data, 0 for no subprocess num_workers (int): The number of subprocess to load data, 0 for no subprocess
used and loading data in main process. When train_data and eval_data are used and loading data in main process. When train_data and eval_data are
both the instance of Dataloader, this argument will be ignored. Default: 0. both the instance of Dataloader, this argument will be ignored. Default: 0.
stack_outputs (bool): Whether stack output field like a batch, as for an output stack_outputs (bool): Whether stack output field like a batch, as for an output
...@@ -1857,10 +1871,10 @@ class Model(object): ...@@ -1857,10 +1871,10 @@ class Model(object):
"""Prints a string summary of the network. """Prints a string summary of the network.
Args: Args:
input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor.
if not set, input_size will be obtained from ``self._inputs`` if the network only has if not set, input_size will be obtained from ``self._inputs`` if the network only has
one input, input_size can be a tuple or InputSpec. if the model has multiple one input, input_size can be a tuple or InputSpec. if the model has multiple
inputs, input_size must be a list which contains every input's shape. inputs, input_size must be a list which contains every input's shape.
Default: None. Default: None.
dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None. dtypes (str, optional): if dtypes is None, 'float32' will be used, Default: None.
...@@ -1872,10 +1886,10 @@ class Model(object): ...@@ -1872,10 +1886,10 @@ class Model(object):
import paddle import paddle
from paddle.static import InputSpec from paddle.static import InputSpec
input = InputSpec([None, 1, 28, 28], 'float32', 'image') input = InputSpec([None, 1, 28, 28], 'float32', 'image')
label = InputSpec([None, 1], 'int64', 'label') label = InputSpec([None, 1], 'int64', 'label')
model = paddle.Model(paddle.vision.LeNet(), model = paddle.Model(paddle.vision.LeNet(),
input, label) input, label)
optim = paddle.optimizer.Adam( optim = paddle.optimizer.Adam(
......