From 15823bb0df42e0c1da7e9b45dabbfb71525f6082 Mon Sep 17 00:00:00 2001
From: lw921014 <liuwei921014@yeah.net>
Date: Mon, 8 Mar 2021 16:35:09 +0800
Subject: [PATCH] [NPU] add npu kernel for communication op (#31437)

* add allreduce and broadcast without test

* add c_broadcast_test case

* build c_comm_init and c_create_group operators

* make the whole thing compile

* add broadcast and init op test case but run failed

* make unit test compile

* fix broadcast test bug and change into hcom for ccl

* change c_comm_init and c_create_group ops accordingly

* make tests compile

* transfer code to 27

* compiled successfully in 28, but run failed

* test broadcast in 28, but failed

* make hcom primitives work

* change hccl data type for base.h

* fix broadcast bug

* make attributes work

* fix group name bug

* add allreduce but test failed

* allreduce bug for qiuliang

* allreduce finished

* add allgather and reducescatter

* merge all op code

* add allgather test

* finish run all ccl op test exclude send/recv

* all all op and test exclude send/recv

* send_v2_npu.cc recv_v2_npiu.cc compiled

* fix ccl core dump bug and test allgather, reducescatter, broadcast op

* fix allreduce bug just for test

* hcom send&recv test pass, without hcom_destroy

* for qiuliang test

* Ascend Send&Recv Test Pass

* all op (ex send/recv) ok

* fix bug

* merge all ccl op

* style merge to PaddlePaddle

* merge style

* new merge style

* merge style 2

* insert an empty at the end

* disable ctest for hcom to pass ci

Co-authored-by: void-main <voidmain1313113@gmail.com>
Co-authored-by: f2hkop <f2huestc@outlook.com>
---
 cmake/generic.cmake                           |  18 ++-
 .../fluid/operators/collective/CMakeLists.txt |  21 ++-
 .../operators/collective/c_allgather_op.cc    |   4 +
 .../collective/c_allgather_op_npu.cc          |  86 ++++++++++
 .../collective/c_allgather_op_npu_test.cc     | 149 ++++++++++++++++++
 .../collective/c_allreduce_max_op_npu.cc      |   4 +-
 .../collective/c_allreduce_max_op_npu_test.cc | 144 +++++++++++++++++
 .../collective/c_allreduce_min_op_npu.cc      |   4 +-
 .../operators/collective/c_allreduce_op.h     |  58 ++++---
 .../collective/c_allreduce_prod_op_npu.cc     |   4 +-
 .../collective/c_allreduce_sum_op_npu.cc      |   4 +-
 .../collective/c_allreduce_sum_op_npu_test.cc | 143 +++++++++++++++++
 .../operators/collective/c_broadcast_op.cc    |   1 -
 .../collective/c_broadcast_op_npu.cc          |  38 +++--
 ...npu_test.cc => c_broadcast_op_npu_test.cc} | 108 ++++---------
 .../collective/c_reducescatter_op.cc          |   4 +
 .../operators/collective/c_reducescatter_op.h |   3 +
 .../collective/c_reducescatter_op_npu.cc      |  89 +++++++++++
 .../collective/c_reducescatter_op_npu_test.cc | 144 +++++++++++++++++
 .../fluid/operators/collective/recv_v2_op.cc  |   6 +
 .../operators/collective/recv_v2_op_npu.cc    |  73 +++++++++
 .../collective/recv_v2_op_npu_test.cc         | 122 ++++++++++++++
 .../fluid/operators/collective/send_v2_op.cc  |   6 +
 .../operators/collective/send_v2_op_npu.cc    |  74 +++++++++
 .../collective/send_v2_op_npu_test.cc         | 109 +++++++++++++
 paddle/fluid/platform/ascend_npu_info.cc      |   2 -
 paddle/fluid/platform/collective_helper.h     |   5 +
 .../fluid/platform/collective_helper_npu.cc   |   2 +-
 paddle/fluid/platform/dynload/hcom.h          |   2 +-
 .../platform/dynload/{base.h => hcom_type.h}  |   0
 30 files changed, 1289 insertions(+), 138 deletions(-)
 create mode 100644 paddle/fluid/operators/collective/c_allgather_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
 rename paddle/fluid/operators/collective/{c_hcom_op_npu_test.cc => c_broadcast_op_npu_test.cc} (61%)
 create mode 100644 paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu.cc
 create mode 100644 paddle/fluid/operators/collective/send_v2_op_npu_test.cc
 rename paddle/fluid/platform/dynload/{base.h => hcom_type.h} (100%)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 391f60ab56f..7356f7fc21c 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -440,9 +440,19 @@ function(cc_test TARGET_NAME)
     cc_test_build(${TARGET_NAME}
 	    SRCS ${cc_test_SRCS}
 	    DEPS ${cc_test_DEPS})
-    cc_test_run(${TARGET_NAME}
-	    COMMAND ${TARGET_NAME}
-	    ARGS ${cc_test_ARGS})
+    # we dont test hcom op, because it need complex configuration
+    # with more than one machine
+    if(NOT ("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test"         OR
+            "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test"     OR
+            "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test"     OR
+            "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test"     OR
+            "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test"         OR
+            "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test"             OR
+            "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test"))
+      cc_test_run(${TARGET_NAME}
+        COMMAND ${TARGET_NAME}
+        ARGS ${cc_test_ARGS})
+    endif()
   endif()
 endfunction(cc_test)
 
@@ -859,7 +869,7 @@ function(py_test TARGET_NAME)
                ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
-    
+
     if (WIN32)
         set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
     endif()
diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt
index b2405b60585..6df7bd3df56 100644
--- a/paddle/fluid/operators/collective/CMakeLists.txt
+++ b/paddle/fluid/operators/collective/CMakeLists.txt
@@ -30,10 +30,27 @@ if(WITH_XPU_BKCL)
 endif()
 
 if(WITH_ASCEND_CL)
-    set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
+set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper)
 endif()
 
 set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE)
 set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency")
 
-cc_test(c_hcom_op_npu_test SRCS c_hcom_op_npu_test.cc DEPS op_registry c_broadcast_op c_allreduce_sum_op c_comm_init_hcom_op ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor)
+if(WITH_ASCEND_CL)
+    set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hcom_op op_registry ascend_hccl flags
+        dynamic_loader dynload_warpctc scope device_context enforce executor)
+    cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc
+        DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc
+        DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc
+        DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc
+        DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc
+        DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc
+        DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+    cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc
+        DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM})
+endif()
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc
index 4111a19c5eb..c4e779698cc 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cc
@@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Out", "(Tensor) the allgather result");
     AddAttr<int>("ring_id", "(int default 0) communication ring id.")
         .SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
+        .SetDefault("tag");
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
new file mode 100644
index 00000000000..2ff1227f307
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+
+#include <memory>
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    std::string tag = ctx.Attr<std::string>("tag");
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    int nranks = comm->nranks();
+
+    framework::DDim out_dims = in->dims();
+    out_dims[0] *= nranks;
+    out->mutable_data<T>(out_dims, place);
+
+    int64_t send_numel = in->numel();
+    void *send_buff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+    void *recv_buff = reinterpret_cast<void*>(out->data<T>());
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+
+    VLOG(3) << "begin hccl allgather, parameter is: "
+      << ", group is " << group
+      << ", ring_id is " << ring_id
+      << ", nranks is " << nranks
+      << ", tag is " << tag;
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_gather(
+        tag.c_str(), send_buff, recv_buff, (u64)send_numel, dtype,
+        group.c_str(), (void*)stream));
+
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(c_allgather,
+                        ops::CAllGatherOpASCENDKernel<int8_t>,
+                        ops::CAllGatherOpASCENDKernel<int>,
+                        ops::CAllGatherOpASCENDKernel<float>,
+                        ops::CAllGatherOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
new file mode 100644
index 00000000000..c8f06eab2d8
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_allgather);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(c_allgather, NPU);
+
+DECLARE_string(selected_npus);
+
+template<typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl <<debugstring; 
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id
+  << "; device_id = " << device_id  
+  << "; rank_id = " << rank_id  
+  << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));  
+  
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<float> init;
+  int rank_id = atoi(getenv("RANK_ID"));
+  
+  int num1 = 1;
+  int num2 = 4;
+
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0 + rank_id);
+  }
+  PrintDebugInfo("input data", init);
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["ring_id"]=0;
+  attrs["nranks"]=2;
+  
+  auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
+                              {{"Out", {"Out"}}}, attrs);
+
+  op->Run(*scope, place);
+  ctx.Wait();
+  
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size() * 2);
+  for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
+    EXPECT_EQ(out_vec[i], 1.0);
+  }
+  for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 2.0);
+  }
+}
+
+
+TEST(c_allgather, NPU) {
+  f::Scope scope;
+
+  // only support one device, if more than one device, use first default  
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  Prepare(&scope, ctx);
+  TestHCCLAllGatherOp(&scope, ctx);
+}
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
index bf57b99e8b2..30fb7b16fc4 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_allreduce_max, 
-    ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
new file mode 100644
index 00000000000..3631442a69e
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -0,0 +1,144 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_allreduce_max);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
+
+DECLARE_string(selected_npus);
+
+template<typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl <<debugstring;
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id
+  << "; device_id = " << device_id
+  << "; rank_id = " << rank_id
+  << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<float> init;
+  int rank_id = atoi(getenv("RANK_ID"));
+
+  int num1 = 100;
+  int num2 = 100;
+
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0 + rank_id * 3);
+  }
+  PrintDebugInfo("input data", init);
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["ring_id"]=0;
+
+  auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
+                              {{"Out", {"Out"}}}, attrs);
+
+  op->Run(*scope, place);
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 4.0);
+  }
+}
+
+TEST(c_allreduce_max, NPU) {
+  f::Scope scope;
+
+  // only support one device, if more than one device, use first default
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  Prepare(&scope, ctx);
+  TestHCCLAllReduceOp(&scope, ctx);
+}
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
index f8edace54e7..bedd1664990 100644
--- a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_allreduce_min, 
-    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index 27cdee5a982..b990e18f6ff 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -19,6 +19,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/memory/memory.h"
 
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -115,36 +117,50 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
 #if defined(PADDLE_WITH_ASCEND_CL)
+
+    // we need to pre-allocate 512 Bytes before the data 
+    // and 512 Bytes after the data, so the hccl allreduce
+    // can work. This is a must acooding to huawei peer.
+    #define PRE_MALLOC_SIZE_BYTES 512
+
     auto in = ctx.Input<framework::LoDTensor>("X");
     auto out = ctx.Output<framework::LoDTensor>("Out");
-
     auto place = ctx.GetPlace();
     hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
-
     int64_t numel = in->numel();
-    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
-    // void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->mutable_data<T>(place)));
 
-    out->Resize(in->dims());
-    // void* recvbuff = reinterpret_cast<void*>(const_cast<T*>(out->data<T>()));
-    void* recvbuff = reinterpret_cast<void*>(const_cast<T*>(out->mutable_data<T>(place)));
-    // void* recvbuff = sendbuff;
+    int64_t pre_tmp_size = PRE_MALLOC_SIZE_BYTES / sizeof(T);
+    int64_t tmp_numel = numel + pre_tmp_size * 2;
+
+    paddle::framework::LoDTensor tmp_in, tmp_out;
+    tmp_in.Resize({tmp_numel});
+    tmp_out.Resize({tmp_numel});
+    tmp_in.mutable_data<T>(place);  // allocate
+    tmp_out.mutable_data<T>(place);  // allocate
+
+    void* sendbuff = reinterpret_cast<void*>(tmp_in.data<T>() + pre_tmp_size);
+    void* recvbuff = reinterpret_cast<void*>(tmp_out.data<T>() + pre_tmp_size);
+
     std::string tag = ctx.Attr<std::string>("tag");
     int ring_id = ctx.Attr<int>("ring_id");
-    // s他的：
     std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-     group = "hccl_world_group";// std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-
     auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
 
     aclrtStream stream = nullptr;
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
 
+    auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place);
+
+    memory::Copy(npu_place, sendbuff, 
+                 npu_place, reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
+                 numel * sizeof(T),
+                 stream);
+
     hcclRedOp_t hccl_red_type = HCCL_REP_OP_SUM;
     switch (red_type) {
       case kRedSum:
@@ -168,7 +184,6 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
             "Invalid reduce type: %d", red_type));
     }
 
-
     VLOG(3) << "begin hccl allreduce, parameter is: "
       << "input num: " << numel
       << "dtype: " << dtype
@@ -176,18 +191,18 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
       << ", group is: " << group
       << ", tag is " << tag;
 
-    printf("sendbuff: %p\n", sendbuff);
-    printf("recvbuff: %p\n", recvbuff);
-
-    // printf("sendbuff: %p, %d\n", sendbuff, ((int*)sendbuff)[0]);
-    // printf("recvbuff: %p, %d\n", recvbuff, ((int*)recvbuff)[0]);
-
     PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_all_reduce(
         tag.c_str(), sendbuff, recvbuff, numel, dtype, hccl_red_type, group.c_str(), (void*)stream));
 
+    memory::Copy(npu_place, reinterpret_cast<void*>(out->data<T>()),
+                 npu_place, recvbuff, 
+                 numel * sizeof(T),
+                 stream);
+    
+    out->Resize(in->dims());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with NPU."));
 #endif
   }
 };
@@ -242,7 +257,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     }
 
     PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
-        sendbuff, recvbuff, numel, dtype, nccl_red_type, comm->comm(), stream));
+        sendbuff, recvbuff, (u64)numel, dtype, nccl_red_type, comm->comm(), stream));
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with GPU."));
@@ -258,7 +273,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) communication ring id.")
         .SetDefault(0);
 #if defined(PADDLE_WITH_ASCEND_CL)
-    #pragma message("hccl CAllReduceOpMaker need tag attr")
     AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
         .SetDefault("tag");
 #endif
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
index f59723ed40e..94eed65687f 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_allreduce_prod, 
-    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
     ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
index 6554976b1c8..a352cb4e3ec 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_allreduce_sum, 
-    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
     ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
+    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
     ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
new file mode 100644
index 00000000000..6e7daf512ed
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_allreduce_sum);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
+
+DECLARE_string(selected_npus);
+
+template<typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl <<debugstring; 
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id
+  << "; device_id = " << device_id  
+  << "; rank_id = " << rank_id  
+  << "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));  
+  
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int num1 = 3;
+  int num2 = 128;
+
+  std::vector<float> init;
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0 + rank_id);
+  }
+  PrintDebugInfo("input data", init);
+
+  auto place = ctx.GetPlace();
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["ring_id"]=0;
+
+  auto op = f::OpRegistry::CreateOp("c_allreduce_sum", 
+                                    {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}}, 
+                                    attrs);
+
+  op->Run(*scope, place);
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 3.0);
+  }
+}
+
+TEST(c_allreduce_sum, NPU) {
+  f::Scope scope;
+
+  // only support one device, if more than one device, use first default  
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  Prepare(&scope, ctx);
+  TestHCCLAllReduceOp(&scope, ctx);
+}
diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc
index 05b4b13d96b..271d543eb23 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op.cc
@@ -43,7 +43,6 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("root", "(int default 0) root id for broadcasting.")
         .SetDefault(0);
 #if defined(PADDLE_WITH_ASCEND_CL)
-    #pragma message("tag")
     AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
         .SetDefault("tag");
 #endif
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
index c2c049b3a91..d826411ddc2 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -39,8 +39,8 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
     auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
 
     aclrtStream stream = nullptr;
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
@@ -54,29 +54,27 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
       << ", group is " << group
       << ", tag is " << tag;
 
-    if (root == static_cast<int>(comm->rank())) {
-      PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
-                                   dtype, (uint32_t)root, group.c_str(), (void*)stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. sent "
-              << x->numel();
-    } else {
-      PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
-                                    dtype, (uint32_t)root, group.c_str(), (void*)stream));
-      VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
-              << framework::product(out->dims());
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_broadcast(tag.c_str(), ptr, numel,
+                                  dtype, (uint32_t)root, group.c_str(), (void*)stream));
+
+    VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved "
+            << framework::product(out->dims());
+
+    dev_ctx->Wait();
+
+    if (out != x) {
+      framework::TensorCopy(
+          *static_cast<const framework::Tensor*>(x), place,
+          *platform::DeviceContextPool::Instance().Get(place),
+          static_cast<framework::Tensor*>(out));
     }
-      if (out != x) {
-        framework::TensorCopy(
-            *static_cast<const framework::Tensor*>(x), place,
-            *platform::DeviceContextPool::Instance().Get(place),
-            static_cast<framework::Tensor*>(out));
-      }
+    dev_ctx->Wait();
 
     out->Resize(x->dims());
     out->set_lod(x->lod());
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with GPU."));
+        "PaddlePaddle should compile with NPU."));
 #endif
   }
 };
@@ -88,7 +86,7 @@ namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
 REGISTER_OP_NPU_KERNEL(c_broadcast,
-                        ops::CBroadcastOpASCENDKernel<float>,
                         ops::CBroadcastOpASCENDKernel<int>,
                         ops::CBroadcastOpASCENDKernel<int8_t>,
+                        ops::CBroadcastOpASCENDKernel<float>,
                         ops::CBroadcastOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_hcom_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
similarity index 61%
rename from paddle/fluid/operators/collective/c_hcom_op_npu_test.cc
rename to paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 643300158f4..65045bce757 100644
--- a/paddle/fluid/operators/collective/c_hcom_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -16,21 +16,21 @@ limitations under the License. */
 #include <unistd.h>
 #endif
 
-#include <stdio.h>
-
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
+#include <stdio.h>
 
 #include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
+
+#include "paddle/fluid/string/printf.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/string/printf.h"
+
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -42,18 +42,30 @@ namespace p = paddle::platform;
 namespace m = paddle::operators::math;
 
 USE_OP(c_broadcast);
-USE_OP(c_allreduce_sum);
 USE_NO_KERNEL_OP(c_comm_init_hcom);
 USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
+
+DECLARE_string(selected_npus);
+
+template<typename T>
+void PrintDebugInfo(const std::string preStr, const  std::vector<T> &data){
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl <<debugstring; 
+}
 
 void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
 
   int rank_id = atoi(getenv("RANK_ID"));
   int device_id = atoi(getenv("DEVICE_ID"));
 
-  printf("rank_id = %d, device_id = %d\n", rank_id, device_id);
-
+  VLOG(2) << "rank_id = " << rank_id
+  << "; device_id = " << device_id  
+  << "; rank_id = " << rank_id  
+  << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));  
+  
   std::vector<int> rank_ids{0, 1};
   f::AttributeMap comm_init_attrs;
   comm_init_attrs["ring_id"] = 0;
@@ -67,24 +79,22 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
   comm_init_op->Run(*scope, place);
   ctx.Wait();
 }
+
 void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
   // init
   auto x = scope->Var("X");
   auto tensor_x = x->GetMutable<f::LoDTensor>();
   int num = 2;
   std::vector<float> init;
   int rank_id = atoi(getenv("RANK_ID"));
-  std::cout<< "rank_id:" << rank_id<<std::endl;
+  
   for (int64_t i = 0; i < num * num; ++i) {
     init.push_back(1.0 + rank_id);
-    std::cout<< init[0];
   }
-  std::cout<<std::endl;
+  PrintDebugInfo("input data", init);
 
   TensorFromVector(init, ctx, tensor_x);
   tensor_x->Resize({num, num});
-
   ctx.Wait();
 
   auto place = ctx.GetPlace();
@@ -92,7 +102,6 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   auto tensor_out = out->GetMutable<f::LoDTensor>();
   tensor_out->Resize({num, num});
   tensor_out->mutable_data<float>(place);  // allocate
-
   ctx.Wait();
 
   // run
@@ -101,84 +110,29 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
   attrs["root"]=0;
   attrs["ring_id"]=0;
 
-  auto op =
-      f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
+  auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
                               {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
-
+  ctx.Wait();
+  
   std::vector<float> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
-
   ctx.Wait();
 
+  PrintDebugInfo("output data", out_vec);
   EXPECT_EQ(out_vec.size(), init.size());
   for (uint32_t i = 0; i < out_vec.size(); i++) {
     EXPECT_EQ(out_vec[i], 1.0);
   }
 }
 
-void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  std::cout<< "BEGIN TEST:" << __FUNCTION__ <<std::endl;
-  // init
-  auto x = scope->Var("X");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-  std::cout<< "rank_id:" << rank_id<<std::endl;
-
-  int num1 = 1;
-  int num2 = 4;
-
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0);
-    // init.push_back(1.0 + rank_id * 3);
-    std::cout<< init[0];
-  }
-  std::cout<<std::endl;
-
-  TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("Out");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"]=std::string("tagx");
-  attrs["ring_id"]=0;
-
-  auto op =
-      f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
-                              {{"Out", {"Out"}}}, attrs);
-
-  op->Run(*scope, place);
-
-  std::vector<float> out_vec;
-  TensorToVector(*tensor_out, ctx, &out_vec);
-
-  ctx.Wait();
-
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 2.0);
-  }
-}
 TEST(c_broadcast, NPU) {
   f::Scope scope;
-  char * npu_id=getenv("FLAGS_selected_npus");
 
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+  // only support one device, if more than one device, use first default  
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
 
   Prepare(&scope, ctx);
   TestHCCLBroadcastOp(&scope, ctx);
-  // TestHCCLAllReduceOp(&scope, ctx);
 }
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc
index ada1fd2b127..7836f11dc9b 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc
@@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("nranks",
                  "Total trainer count of the distributed training job")
         .SetDefault(1);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
+        .SetDefault("tag");
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h
index 366d8a3747c..e81e5003f5a 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op.h
+++ b/paddle/fluid/operators/collective/c_reducescatter_op.h
@@ -22,6 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+
+
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
new file mode 100644
index 00000000000..4658647ac94
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto in = ctx.Input<framework::Tensor>("X");
+    auto out = ctx.Output<framework::Tensor>("Out");
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    std::string tag = ctx.Attr<std::string>("tag");
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+    int nranks = comm->nranks();
+
+    auto out_dims = in->dims();
+    PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0,
+                      platform::errors::InvalidArgument(
+                          "The input tensor X's "
+                          "dim[0] (%d) should be divisible by nranks(%d)",
+                          out_dims[0], nranks));
+    
+    out_dims[0] = out_dims[0] / nranks;
+    out->mutable_data<T>(out_dims, place);
+
+    int64_t recv_numel = in->numel() / nranks;
+
+    void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
+    void* outputPtr = reinterpret_cast<void*>(out->data<T>());
+    hcclDataType_t dtype = platform::ToHCCLDataType(in->type());
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    VLOG(3) << "begin hccl reduce scatter, parameter is: "
+      << "recv_numel: " << recv_numel
+      << "dtype: " << dtype
+      << "hccl_red_type: " << HCCL_REP_OP_SUM
+      << ", group is: " << group
+      << ", tag is " << tag;
+    
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_reduce_scatter(
+        tag.c_str(), inputPtr, outputPtr, (u64)recv_numel, dtype, HCCL_REP_OP_SUM, group.c_str(), (void*)stream));
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(c_reducescatter, 
+                        ops::CReduceScatterOpAscendKernel<int8_t>,
+                        ops::CReduceScatterOpAscendKernel<int>,
+                        ops::CReduceScatterOpAscendKernel<float>,
+                        ops::CReduceScatterOpAscendKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
new file mode 100644
index 00000000000..c04dee5b692
--- /dev/null
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -0,0 +1,144 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(c_reducescatter);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
+
+DECLARE_string(selected_npus);
+
+template<typename T>
+void PrintDebugInfo(const std::string preStr, const  std::vector<T> &data){
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl <<debugstring; 
+}
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id
+  << "; device_id = " << device_id  
+  << "; rank_id = " << rank_id  
+  << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));  
+  
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["nranks"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op =
+      f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<float> init;
+  int num1 = 4;
+  int num2 = 1;
+
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0);
+  }
+  PrintDebugInfo("input data", init);
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"]=std::string("tagx");
+  attrs["ring_id"]=0;
+  attrs["nranks"]=2;
+
+  auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}},
+                              {{"Out", {"Out"}}}, attrs);
+
+  op->Run(*scope, place);
+  ctx.Wait();
+  
+  std::vector<float> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+  EXPECT_EQ(out_vec.size(), init.size() / 2);
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 2.0);
+  }
+}
+
+TEST(c_reducescatter, NPU) {
+  f::Scope scope;
+
+  // only support one device, if more than one device, use first default  
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  Prepare(&scope, ctx);
+  TestHCCLReduceScatterOp(&scope, ctx);
+}
diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc
index 10408820387..d65e3fdbb2f 100644
--- a/paddle/fluid/operators/collective/recv_v2_op.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op.cc
@@ -62,6 +62,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
     AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
         .SetDefault(5);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
+        .SetDefault(0);
+#endif
     AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
         .SetDefault(std::vector<int>());
     AddAttr<bool>(
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
new file mode 100644
index 00000000000..8d49c6e278e
--- /dev/null
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/recv_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CRecvOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto out = ctx.Output<framework::LoDTensor>("Out");
+    int numel = out->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(out->type());
+
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto place = ctx.GetPlace();
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int srcRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+    VLOG(3) << "recv_v2_npu attr get";
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_receive(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(out->data<T>())), (u64)numel, dtype, srcRank,
+          srTag, group.c_str(), stream));
+     VLOG(3) << "Source Rank: " << srcRank << " Invoke hcom receive. receiving ";
+    out->Resize(out->dims());
+    out->set_lod(out->lod());
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(recv_v2, 
+                        ops::CRecvOpASCENDKernel<int>,
+                        ops::CRecvOpASCENDKernel<int8_t>,
+                        ops::CRecvOpASCENDKernel<float>,
+                        ops::CRecvOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
new file mode 100644
index 00000000000..22492445a33
--- /dev/null
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -0,0 +1,122 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/recv_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(recv_v2);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(recv_v2, NPU);
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+    std::string rank_table_file = getenv("RANK_TABLE_FILE");
+    int rank_id = atoi(getenv("RANK_ID"));
+    int device_id = atoi(getenv("DEVICE_ID"));
+    int src_rank = atoi(getenv("SRC_RANK"));
+    int dest_rank = atoi(getenv("DEST_RANK"));
+    VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
+
+    std::vector<int> rank_ids = {0,1};
+    f::AttributeMap comm_init_attrs;
+    comm_init_attrs["ring_id"] = 0;
+    comm_init_attrs["nranks"] = 2;
+    comm_init_attrs["rank"] = rank_id;
+    comm_init_attrs["device_id"] = device_id;
+    comm_init_attrs["rank_ids"] = rank_ids;
+    auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+    VLOG(3) << "CreateOp c_comm_init_hcom";
+    auto place = ctx.GetPlace();
+    comm_init_op->Run(*scope, place);
+    ctx.Wait();
+}
+
+void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
+    std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
+
+    int num = atoi(getenv("DATA_SIZE"));
+    EXPECT_GT(num, 0);
+    EXPECT_LT(num, 1 << 15);
+    int rank_id = atoi(getenv("RANK_ID"));
+    VLOG(3) << "rank_id:" << rank_id<<std::endl;
+
+    ctx.Wait();
+    auto place = ctx.GetPlace();
+    auto out = scope->Var("Out");
+    auto tensor_out = out->GetMutable<f::LoDTensor>();
+    tensor_out->Resize({num, num});
+    tensor_out->mutable_data<float>(place);  // allocate
+
+    ctx.Wait();
+
+    f::AttributeMap attrs;
+    attrs["tag"]=std::string("srtest");
+    attrs["peer"]=atoi(getenv("SRC_RANK"));
+    attrs["ring_id"]=0;
+    attrs["srTag"]=0;
+    std::vector<int> out_shape;
+    out_shape.push_back(num);
+    out_shape.push_back(num);
+    attrs["out_shape"]=out_shape;
+
+    auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
+    VLOG(3) << "CreateOp recv_v2";
+
+    op->Run(*scope, place);
+    VLOG(3) << "Run op recv_v2";
+    std::vector<float> out_vec;
+    TensorToVector(*tensor_out, ctx, &out_vec);
+    ctx.Wait();
+    std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
+    EXPECT_EQ(out_vec == init, true);
+}
+
+
+TEST(recv_v2, NPU){
+    f::Scope scope;
+    char * npu_id=getenv("FLAGS_selected_npus");
+    VLOG(3) << "Select npu:" << npu_id;
+    p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+    VLOG(3) << "Place over";
+    Prepare(&scope, ctx);
+    VLOG(3) << "Prepare over";
+    TestHcomRecvOp(&scope, ctx);
+    VLOG(3) << "Test over";
+}
diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc
index c5a86b4f088..c60d560e43b 100644
--- a/paddle/fluid/operators/collective/send_v2_op.cc
+++ b/paddle/fluid/operators/collective/send_v2_op.cc
@@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
     AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
         .SetDefault(0);
     AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
+#if defined(PADDLE_WITH_ASCEND_CL)
+    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
+        .SetDefault("tag");
+    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
+        .SetDefault(0);
+#endif
     AddAttr<bool>(
         "use_calc_stream",
         "(bool default false) eject CUDA operations to calculation stream.")
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc b/paddle/fluid/operators/collective/send_v2_op_npu.cc
new file mode 100644
index 00000000000..d0663ea42cb
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CSendOpASCENDKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_ASCEND_CL)
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    int numel = x->numel();
+    hcclDataType_t dtype = platform::ToHCCLDataType(x->type());
+
+    auto place = ctx.GetPlace();
+    int ring_id = ctx.Attr<int>("ring_id");
+    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
+
+    aclrtStream stream = nullptr;
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    std::string tag = ctx.Attr<std::string>("tag");
+    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
+    int destRank = ctx.Attr<int>("peer");
+    int srTag = ctx.Attr<int>("srTag");
+
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_send(
+        tag.c_str(), reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), (u64)numel, dtype, destRank,
+          srTag, group.c_str(), stream));
+    
+      VLOG(3) << "Dest rank:" << destRank << " Invoke hcom send. Sent "
+              << x->numel();
+
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with NPU."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_NPU_KERNEL(send_v2, 
+                        ops::CSendOpASCENDKernel<int>,
+                        ops::CSendOpASCENDKernel<int8_t>,
+                        ops::CSendOpASCENDKernel<float>,
+                        ops::CSendOpASCENDKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
new file mode 100644
index 00000000000..1759633d9e8
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -0,0 +1,109 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+#include <stdio.h>
+#include "gtest/gtest.h"
+
+#include "paddle/fluid/string/printf.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(send_v2);
+USE_NO_KERNEL_OP(c_comm_init_hcom);
+USE_OP_DEVICE_KERNEL(send_v2, NPU);
+
+void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
+
+    std::string rank_table_file = getenv("RANK_TABLE_FILE");
+    int rank_id = atoi(getenv("RANK_ID"));
+    int device_id = atoi(getenv("DEVICE_ID"));
+    int src_rank = atoi(getenv("SRC_RANK"));
+    int dest_rank = atoi(getenv("DEST_RANK"));
+    VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
+
+    std::vector<int> rank_ids = {0, 1};
+    f::AttributeMap comm_init_attrs;
+    comm_init_attrs["ring_id"] = 0;
+    comm_init_attrs["nranks"] = 2;
+    comm_init_attrs["rank"] = rank_id;
+    comm_init_attrs["device_id"] = device_id;
+    comm_init_attrs["rank_ids"] = rank_ids;
+    auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
+    auto place = ctx.GetPlace();
+    comm_init_op->Run(*scope, place);
+    ctx.Wait();
+}
+
+void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
+    std::cout<< "BEGIN TEST:"<< __FUNCTION__ <<std::endl;
+    auto x = scope->Var("X");
+    auto tensor_x = x->GetMutable<f::LoDTensor>();
+    int num = atoi(getenv("DATA_SIZE"));;
+    EXPECT_GT(num, 0);
+    EXPECT_LT(num, 1 << 15);
+    std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
+    int rank_id = atoi(getenv("RANK_ID"));
+    VLOG(3)<<"rank id:"<<rank_id;
+    TensorFromVector(init, ctx, tensor_x);
+    tensor_x->Resize({num, num});
+    ctx.Wait();
+    auto place = ctx.GetPlace();
+    ctx.Wait();
+
+    f::AttributeMap attrs;
+    attrs["tag"]=std::string("srtest");
+    attrs["peer"]=atoi(getenv("DEST_RANK"));
+    attrs["ring_id"]=0;
+    attrs["srTag"]=0;
+
+    auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
+    
+    op->Run(*scope, place);
+    VLOG(3)<<"send run over";
+    ctx.Wait();    
+}
+
+TEST(send_v2, NPU){
+    f::Scope scope;
+    char * npu_id=getenv("FLAGS_selected_npus");
+    VLOG(3) << "Select npu:" << npu_id;
+    p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
+    VLOG(3) << "Place over";
+    Prepare(&scope, ctx);
+    VLOG(3) << "Prepare over";
+    TestHcomSendOp(&scope, ctx);
+    VLOG(3) << "Test over";
+
+}
diff --git a/paddle/fluid/platform/ascend_npu_info.cc b/paddle/fluid/platform/ascend_npu_info.cc
index a6a5e4e8631..3b9353eb3ae 100644
--- a/paddle/fluid/platform/ascend_npu_info.cc
+++ b/paddle/fluid/platform/ascend_npu_info.cc
@@ -31,5 +31,3 @@ int NPUDevice::GetDeviceCount() {
 }  // namespace ascend
 }  // namespace platform
 }  // namespace paddle
-
-
diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h
index 481b23f14f1..e21919a429b 100644
--- a/paddle/fluid/platform/collective_helper.h
+++ b/paddle/fluid/platform/collective_helper.h
@@ -212,6 +212,11 @@ class HCCLCommContext {
   // Init global hcom
   HCCLCommContext() { InitHcomWorldGroup(); }
 
+public:
+  ~HCCLCommContext(){
+    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::hcom_destroy());
+  }
+
   std::once_flag once_flag_;
   std::mutex comm_map_mutex_;
   // ring id to dev-HCCLComm
diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc
index edfa351f19b..56cfdec4b5e 100644
--- a/paddle/fluid/platform/collective_helper_npu.cc
+++ b/paddle/fluid/platform/collective_helper_npu.cc
@@ -1,4 +1,4 @@
-//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
diff --git a/paddle/fluid/platform/dynload/hcom.h b/paddle/fluid/platform/dynload/hcom.h
index d63e5dfc88e..8b1778680f3 100644
--- a/paddle/fluid/platform/dynload/hcom.h
+++ b/paddle/fluid/platform/dynload/hcom.h
@@ -23,7 +23,7 @@
 #define HCOM_H_
 
 // #include <runtime/rt.h>
-#include "paddle/fluid/platform/dynload/base.h"
+#include "paddle/fluid/platform/dynload/hcom_type.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/paddle/fluid/platform/dynload/base.h b/paddle/fluid/platform/dynload/hcom_type.h
similarity index 100%
rename from paddle/fluid/platform/dynload/base.h
rename to paddle/fluid/platform/dynload/hcom_type.h
-- 
GitLab