[CodeStyle] remove crlf for cpp files (#46156)

846c7e70 · Nyakku Shigure · GitHub · c6c9c186 · 846c7e70 · 846c7e70
10 changed files
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <glog/logging.h>
-
-#include <iosfwd>
-#include <string>
-
-#include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/inference/analysis/argument.h"
-#include "paddle/fluid/inference/analysis/helper.h"
-
-namespace paddle {
-namespace inference {
-namespace analysis {
-
-/*
- * AnalysisPass is a pass used to control the IR passes.
- */
-class AnalysisPass {
- public:
-  AnalysisPass() = default;
-  virtual ~AnalysisPass() = default;
-
-  // Run on a single Graph.
-  void Run(Argument* argument) { RunImpl(argument); }
-
-  // Human-readable short representation.
-  virtual std::string repr() const = 0;
-  // Human-readable long description.
-  virtual std::string description() const { return "No DOC"; }
-
- protected:
-  // User should implement these.
-  virtual void RunImpl(Argument* argument) = 0;
-};
-
-}  // namespace analysis
-}  // namespace inference
-}  // namespace paddle
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include <iosfwd>
+#include <string>
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/inference/analysis/argument.h"
+#include "paddle/fluid/inference/analysis/helper.h"
+
+namespace paddle {
+namespace inference {
+namespace analysis {
+
+/*
+ * AnalysisPass is a pass used to control the IR passes.
+ *
+ * This is an abstract interface: repr() and RunImpl() are pure virtual and
+ * must be supplied by a concrete pass. Run() is the public, non-virtual entry
+ * point and simply forwards its argument to RunImpl(). description() returns
+ * "No DOC" unless a subclass overrides it.
+ */
+class AnalysisPass {
+ public:
+  AnalysisPass() = default;
+  virtual ~AnalysisPass() = default;
+
+  // Run on a single Graph.
+  void Run(Argument* argument) { RunImpl(argument); }
+
+  // Human-readable short representation.
+  virtual std::string repr() const = 0;
+  // Human-readable long description.
+  virtual std::string description() const { return "No DOC"; }
+
+ protected:
+  // User should implement these.
+  virtual void RunImpl(Argument* argument) = 0;
+};
+
+}  // namespace analysis
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_allgather);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allgather, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-
-  int num1 = 1;
-  int num2 = 4;
-
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-  attrs["nranks"] = 2;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-
-  EXPECT_EQ(out_vec.size(), init.size() * 2);
-  for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
-    EXPECT_EQ(out_vec[i], 1.0);
-  }
-  for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 2.0);
-  }
-}
-
-TEST(c_allgather, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLAllGatherOp(&scope, ctx);
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_allgather);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_allgather, NPU);
+
+DECLARE_string(selected_npus);
+
+// Logs `preStr`, a colon and a newline, then every element of `data` rendered
+// with std::to_string and followed by a comma, at VLOG level 2.
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string joined;
+  for (const auto& value : data) {
+    joined.append(std::to_string(value));
+    joined.append(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl << joined;
+}
+
+// Produces the HCCL root info for this process and copies it into `hccl_id`.
+//
+// RANK_ID and DEVICE_ID are read from the environment. A `c_gen_hccl_id` op is
+// created with the scope variable "Out" as its output, run on the place of
+// `ctx`, and after `ctx.Wait()` the first 1024 bytes of "Out" are copied to
+// `*hccl_id`.
+//
+// The endpoint table has exactly two entries, so RANK_ID must be 0 or 1. A
+// missing variable or an out-of-range rank is reported as a fatal gtest
+// failure (the function returns early) rather than passing a null pointer to
+// atoi() or indexing past the end of the table.
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  const char* rank_env = getenv("RANK_ID");
+  const char* device_env = getenv("DEVICE_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
+
+  int rank_id = atoi(rank_env);
+  int device_id = atoi(device_env);
+
+  // NOTE(review): the last field is labelled RANK_TABLE_FILE but the value
+  // streamed is DEVICE_ID; the message text is left untouched -- confirm which
+  // one was intended.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << device_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
+      << "RANK_ID must be 0 or 1, got " << rank_id;
+
+  f::AttributeMap gen_hccl_id;
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  // The only peer is whichever endpoint this rank does not own.
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  // NOTE(review): copies a fixed 1024 bytes; assumes HcclRootInfo is at least
+  // that large -- TODO confirm against the HCCL header.
+  memcpy(hccl_id, id, 1024);
+}
+
+// Runs the `c_comm_init_hccl` op using the root info in `*hccl_id`.
+//
+// The first 1024 bytes of `*hccl_id` are copied into the scope variable "X",
+// which is the op's input. The op is configured with ring_id 0, `rank_ids`
+// set to the integer 2, and the rank / device id read from the RANK_ID and
+// DEVICE_ID environment variables, then run on the place of `ctx`.
+//
+// A missing environment variable is reported as a fatal gtest failure (the
+// function returns early) instead of passing a null pointer to atoi().
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  const char* rank_env = getenv("RANK_ID");
+  const char* device_env = getenv("DEVICE_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
+
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  // NOTE(review): copies a fixed 1024 bytes; assumes HcclRootInfo is at least
+  // that large -- TODO confirm against the HCCL header.
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(rank_env);
+  int device_id = atoi(device_env);
+
+  // NOTE(review): the last field is labelled RANK_TABLE_FILE but the value
+  // streamed is DEVICE_ID; the message text is left untouched -- confirm which
+  // one was intended.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << device_id;
+
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+// Runs `c_allgather` and checks the gathered output.
+//
+// The input tensor "Data" is 1x4 floats, every element equal to
+// (1.0 + RANK_ID). The op is run ten times with nranks = 2 into "OutData".
+// The output is expected to hold twice as many elements as the input, with
+// the first half equal to 1.0 and the second half equal to 2.0.
+//
+// A missing RANK_ID is reported as a fatal gtest failure (the function returns
+// early) instead of passing a null pointer to atoi().
+void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  const char* rank_env = getenv("RANK_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  int rank_id = atoi(rank_env);
+
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  int num1 = 1;
+  int num2 = 4;
+
+  // Every input element carries the same rank-dependent value.
+  std::vector<float> init(static_cast<size_t>(num1 * num2),
+                          static_cast<float>(1.0 + rank_id));
+  PrintDebugInfo("input data", init);
+
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx");
+  attrs["ring_id"] = 0;
+  attrs["nranks"] = 2;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size() * 2);
+  // First half is checked against 1.0, second half against 2.0.
+  const size_t half = out_vec.size() / 2;
+  for (size_t i = 0; i < half; i++) {
+    EXPECT_EQ(out_vec[i], 1.0);
+  }
+  for (size_t i = half; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 2.0);
+  }
+}
+
+// Drives the all-gather check: generate the HCCL id, run the communicator
+// init op, then run the op under test, all on one NPU device context.
+TEST(c_allgather, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+
+  // only support one device, if more than one device, use first default
+  const int npu_index = atoi(FLAGS_selected_npus.c_str());
+  // Extra parentheses keep this from parsing as a function declaration.
+  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));
+
+  f::Scope* const scope_ptr = &scope;
+  HcclRootInfo* const id_ptr = &hccl_id;
+  PrepareUniqueId(scope_ptr, ctx, id_ptr);
+  Prepare(scope_ptr, ctx, id_ptr);
+  TestHCCLAllGatherOp(scope_ptr, ctx);
+}
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_allreduce_max);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-
-  int num1 = 100;
-  int num2 = 100;
-
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id * 3);
-  }
-  PrintDebugInfo("input data", init);
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 4.0);
-  }
-}
-
-TEST(c_allreduce_max, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLAllReduceOp(&scope, ctx);
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_allreduce_max);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
+
+DECLARE_string(selected_npus);
+
+// Logs `preStr`, a colon and a newline, then every element of `data` rendered
+// with std::to_string and followed by a comma, at VLOG level 2.
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string joined;
+  for (const auto& value : data) {
+    joined.append(std::to_string(value));
+    joined.append(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl << joined;
+}
+
+// Produces the HCCL root info for this process and copies it into `hccl_id`.
+//
+// RANK_ID and DEVICE_ID are read from the environment. A `c_gen_hccl_id` op is
+// created with the scope variable "Out" as its output, run on the place of
+// `ctx`, and after `ctx.Wait()` the first 1024 bytes of "Out" are copied to
+// `*hccl_id`.
+//
+// The endpoint table has exactly two entries, so RANK_ID must be 0 or 1. A
+// missing variable or an out-of-range rank is reported as a fatal gtest
+// failure (the function returns early) rather than passing a null pointer to
+// atoi() or indexing past the end of the table.
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  const char* rank_env = getenv("RANK_ID");
+  const char* device_env = getenv("DEVICE_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
+
+  int rank_id = atoi(rank_env);
+  int device_id = atoi(device_env);
+
+  // NOTE(review): the last field is labelled RANK_TABLE_FILE but the value
+  // streamed is DEVICE_ID; the message text is left untouched -- confirm which
+  // one was intended.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << device_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
+      << "RANK_ID must be 0 or 1, got " << rank_id;
+
+  f::AttributeMap gen_hccl_id;
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  // The only peer is whichever endpoint this rank does not own.
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  // NOTE(review): copies a fixed 1024 bytes; assumes HcclRootInfo is at least
+  // that large -- TODO confirm against the HCCL header.
+  memcpy(hccl_id, id, 1024);
+}
+
+// Runs the `c_comm_init_hccl` op using the root info in `*hccl_id`.
+//
+// The first 1024 bytes of `*hccl_id` are copied into the scope variable "X",
+// which is the op's input. The op is configured with ring_id 0, `rank_ids`
+// set to the integer 2, and the rank / device id read from the RANK_ID and
+// DEVICE_ID environment variables, then run on the place of `ctx`.
+//
+// A missing environment variable is reported as a fatal gtest failure (the
+// function returns early) instead of passing a null pointer to atoi().
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  const char* rank_env = getenv("RANK_ID");
+  const char* device_env = getenv("DEVICE_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
+
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  // NOTE(review): copies a fixed 1024 bytes; assumes HcclRootInfo is at least
+  // that large -- TODO confirm against the HCCL header.
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(rank_env);
+  int device_id = atoi(device_env);
+
+  // NOTE(review): the last field is labelled RANK_TABLE_FILE but the value
+  // streamed is DEVICE_ID; the message text is left untouched -- confirm which
+  // one was intended.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << device_id;
+
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+// Runs `c_allreduce_max` and checks the reduced output.
+//
+// The input tensor "Data" is 100x100 floats, every element equal to
+// (1.0 + RANK_ID * 3). The op is run ten times into "OutData". The output is
+// expected to have the same number of elements as the input, all equal to 4.0.
+//
+// A missing RANK_ID is reported as a fatal gtest failure (the function returns
+// early) instead of passing a null pointer to atoi().
+void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  const char* rank_env = getenv("RANK_ID");
+  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
+  int rank_id = atoi(rank_env);
+
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  int num1 = 100;
+  int num2 = 100;
+
+  // Every input element carries the same rank-dependent value.
+  std::vector<float> init(static_cast<size_t>(num1 * num2),
+                          static_cast<float>(1.0 + rank_id * 3));
+  PrintDebugInfo("input data", init);
+
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx");
+  attrs["ring_id"] = 0;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (size_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 4.0);
+  }
+}
+
+// Drives the all-reduce-max check: generate the HCCL id, run the communicator
+// init op, then run the op under test, all on one NPU device context.
+TEST(c_allreduce_max, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+
+  // only support one device, if more than one device, use first default
+  const int npu_index = atoi(FLAGS_selected_npus.c_str());
+  // Extra parentheses keep this from parsing as a function declaration.
+  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));
+
+  f::Scope* const scope_ptr = &scope;
+  HcclRootInfo* const id_ptr = &hccl_id;
+  PrepareUniqueId(scope_ptr, ctx, id_ptr);
+  Prepare(scope_ptr, ctx, id_ptr);
+  TestHCCLAllReduceOp(scope_ptr, ctx);
+}
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
-// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
-// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
-// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_allreduce_sum);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  std::cout << preStr << ":" << std::endl << debugstring;
-  for (auto ele : data) {
-    std::cout << ele << " ";
-  }
-  std::cout << std::endl;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-template <typename T>
-void TestHCCLAllReduceOp(f::Scope* scope,
-                         const p::DeviceContext& ctx,
-                         int iter) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 3;
-  int num2 = 128;
-
-  std::vector<T> init;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(static_cast<T>(1.0 + rank_id));
-  }
-  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
-  PrintDebugInfo("input data", init);
-
-  auto place = ctx.GetPlace();
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<T>(place);  // allocate
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
-  attrs["ring_id"] = 0;
-  attrs["use_calc_stream"] = 1;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-  for (int i = 0; i < 1; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-
-  std::vector<T> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-
-  float diff = static_cast<float>(out_vec[0]) - 65504;
-  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 1; i < 10; i++) {
-    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
-  }
-}
-
-TEST(c_allreduce_sum, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  // only support one device, if more than one device, use first default
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-
-  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
-  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
+// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
+// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_allreduce_sum);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
+
+DECLARE_string(selected_npus);
+
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string debugstring = "";
+  std::cout << preStr << ":" << std::endl << debugstring;
+  for (auto ele : data) {
+    std::cout << ele << " ";
+  }
+  std::cout << std::endl;
+}
+
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+template <typename T>
+void TestHCCLAllReduceOp(f::Scope* scope,
+                         const p::DeviceContext& ctx,
+                         int iter) {
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int num1 = 3;
+  int num2 = 128;
+
+  std::vector<T> init;
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(static_cast<T>(1.0 + rank_id));
+  }
+  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
+  PrintDebugInfo("input data", init);
+
+  auto place = ctx.GetPlace();
+
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<T>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
+  attrs["ring_id"] = 0;
+  attrs["use_calc_stream"] = 1;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+  for (int i = 0; i < 1; i++) {
+    op->Run(*scope, place);
+  }
+  ctx.Wait();
+
+  std::vector<T> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  float diff = static_cast<float>(out_vec[0]) - 65504;
+  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 1; i < 10; i++) {
+    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
+  }
+}
+
+TEST(c_allreduce_sum, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  // only support one device, if more than one device, use first default
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+
+  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
+  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
+}
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_broadcast);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-  int num = 2;
-  std::vector<float> init;
-  int rank_id = atoi(getenv("RANK_ID"));
-
-  for (int64_t i = 0; i < num * num; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num, num});
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num, num});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["root"] = 0;
-  attrs["ring_id"] = 0;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-
-  for (int i = 0; i < 10; i++) {
-    op->Run(*scope, place);
-  }
-  ctx.Wait();
-
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 1.0);
-  }
-}
-
-TEST(c_broadcast, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLBroadcastOp(&scope, ctx);
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_broadcast);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
+
+DECLARE_string(selected_npus);
+
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl << debugstring;
+}
+
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  int num = 2;
+  std::vector<float> init;
+  int rank_id = atoi(getenv("RANK_ID"));
+
+  for (int64_t i = 0; i < num * num; ++i) {
+    init.push_back(1.0 + rank_id);
+  }
+  PrintDebugInfo("input data", init);
+
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num, num});
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num, num});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx");
+  attrs["root"] = 0;
+  attrs["ring_id"] = 0;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+
+  for (int i = 0; i < 10; i++) {
+    op->Run(*scope, place);
+  }
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 1.0);
+  }
+}
+
+TEST(c_broadcast, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+  // only support one device, if more than one device, use first default
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLBroadcastOp(&scope, ctx);
+}
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_reduce_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_reduce_sum);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(3) << preStr << ":" << std::endl << debugstring;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int num1 = 3;
-  int num2 = 128;
-
-  std::vector<float> init;
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0 + rank_id);
-  }
-  PrintDebugInfo("input data", init);
-
-  auto place = ctx.GetPlace();
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-  ctx.Wait();
-
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
-  attrs["ring_id"] = 0;
-  int root_id = 0;
-  attrs["root_id"] = root_id;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-
-  op->Run(*scope, place);
-  ctx.Wait();
-
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-
-  EXPECT_EQ(out_vec.size(), init.size());
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    if (rank_id == root_id) {
-      EXPECT_EQ(out_vec[i], 3.0);
-    } else {
-      EXPECT_EQ(out_vec[i], init[i]);
-    }
-  }
-}
-
-TEST(c_reduce_sum, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  for (int i = 0; i < 2; i++) {
-    VLOG(2) << "iter num: " << i;
-    TestHCCLReduceOp(&scope, ctx, i);
-  }
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_reduce_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_reduce_sum);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
+
+DECLARE_string(selected_npus);
+
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(3) << preStr << ":" << std::endl << debugstring;
+}
+
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  memcpy(hccl_id, id, 1024);
+}
+
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int num1 = 3;
+  int num2 = 128;
+
+  std::vector<float> init;
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0 + rank_id);
+  }
+  PrintDebugInfo("input data", init);
+
+  auto place = ctx.GetPlace();
+
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+  ctx.Wait();
+
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
+  attrs["ring_id"] = 0;
+  int root_id = 0;
+  attrs["root_id"] = root_id;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+
+  op->Run(*scope, place);
+  ctx.Wait();
+
+  std::vector<float> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+
+  EXPECT_EQ(out_vec.size(), init.size());
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    if (rank_id == root_id) {
+      EXPECT_EQ(out_vec[i], 3.0);
+    } else {
+      EXPECT_EQ(out_vec[i], init[i]);
+    }
+  }
+}
+
+TEST(c_reduce_sum, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+
+  // only support one device, if more than one device, use first default
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  for (int i = 0; i < 2; i++) {
+    VLOG(2) << "iter num: " << i;
+    TestHCCLReduceOp(&scope, ctx, i);
+  }
+}
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
-/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
-#include <stdio.h>
-
-#include <string>
-#include <thread>  // NOLINT
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/collective/c_allgather_op.h"
-#include "paddle/fluid/operators/collective/c_allreduce_op.h"
-#include "paddle/fluid/operators/collective/c_broadcast_op.h"
-#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
-#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
-#include "paddle/fluid/string/printf.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#include "paddle/fluid/platform/device/npu/hccl_helper.h"
-#endif
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-USE_OP(c_reducescatter);
-USE_NO_KERNEL_OP(c_gen_hccl_id);
-USE_NO_KERNEL_OP(c_comm_init_hccl);
-USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
-
-DECLARE_string(selected_npus);
-
-template <typename T>
-void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
-  std::string debugstring = "";
-  for (auto ele : data) {
-    debugstring += std::to_string(ele) + std::string(",");
-  }
-  VLOG(2) << preStr << ":" << std::endl << debugstring;
-}
-
-void PrepareUniqueId(f::Scope* scope,
-                     const p::DeviceContext& ctx,
-                     HcclRootInfo* hccl_id) {
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  std::vector<int> rank_ids{0, 1};
-  f::AttributeMap gen_hccl_id;
-
-  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
-  gen_hccl_id["rank"] = rank_id;
-  gen_hccl_id["endpoint"] = endpointList[rank_id];
-  std::vector<std::string> other_endpoints = {
-      endpointList[rank_id == 0 ? 1 : 0]};
-  gen_hccl_id["other_endpoints"] = other_endpoints;
-
-  auto out = scope->Var("Out");
-  auto id = out->GetMutable<HcclRootInfo>();
-
-  VLOG(3) << "break";
-
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
-  VLOG(3) << "break";
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-
-  memcpy(hccl_id, id, 1024);
-}
-
-void Prepare(f::Scope* scope,
-             const p::DeviceContext& ctx,
-             HcclRootInfo* hccl_id) {
-  auto x = scope->Var("X");
-  auto id = x->GetMutable<HcclRootInfo>();
-
-  memcpy(id, hccl_id, 1024);
-
-  int rank_id = atoi(getenv("RANK_ID"));
-  int device_id = atoi(getenv("DEVICE_ID"));
-
-  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
-          << "; rank_id = " << rank_id
-          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
-
-  // std::vector<int> rank_ids{0, 1};
-  f::AttributeMap comm_init_attrs;
-  comm_init_attrs["ring_id"] = 0;
-  comm_init_attrs["rank_ids"] = 2;
-  comm_init_attrs["rank"] = rank_id;
-  comm_init_attrs["device_id"] = device_id;
-  // comm_init_attrs["rank_ids"] = rank_ids;
-  auto comm_init_op = f::OpRegistry::CreateOp(
-      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
-  auto place = ctx.GetPlace();
-  comm_init_op->Run(*scope, place);
-  ctx.Wait();
-}
-
-void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
-  // init
-  auto x = scope->Var("Data");
-  auto tensor_x = x->GetMutable<f::LoDTensor>();
-
-  std::vector<float> init;
-  int num1 = 4;
-  int num2 = 1;
-
-  for (int64_t i = 0; i < num1 * num2; ++i) {
-    init.push_back(1.0);
-  }
-  PrintDebugInfo("input data", init);
-
-  paddle::framework::TensorFromVector(init, ctx, tensor_x);
-  tensor_x->Resize({num1, num2});
-
-  ctx.Wait();
-
-  auto place = ctx.GetPlace();
-  auto out = scope->Var("OutData");
-  auto tensor_out = out->GetMutable<f::LoDTensor>();
-  tensor_out->Resize({num1, num2});
-  tensor_out->mutable_data<float>(place);  // allocate
-
-  ctx.Wait();
-
-  // run
-  f::AttributeMap attrs;
-  attrs["tag"] = std::string("tagx");
-  attrs["ring_id"] = 0;
-  attrs["nranks"] = 2;
-
-  auto op = f::OpRegistry::CreateOp(
-      "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
-
-  int iter_num = 10;
-  for (int i = 0; i < iter_num; i++) {
-    op->Run(*scope, place);
-    ctx.Wait();
-  }
-
-  std::vector<float> out_vec;
-  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
-  ctx.Wait();
-
-  PrintDebugInfo("output data", out_vec);
-  EXPECT_EQ(out_vec.size(), init.size() / 2);
-  for (uint32_t i = 0; i < out_vec.size(); i++) {
-    EXPECT_EQ(out_vec[i], 2.0);
-  }
-}
-
-TEST(c_reducescatter, NPU) {
-  f::Scope scope;
-  HcclRootInfo hccl_id;
-
-  // only support one device, if more than one device, use first default
-  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
-
-  PrepareUniqueId(&scope, ctx, &hccl_id);
-  Prepare(&scope, ctx, &hccl_id);
-  TestHCCLReduceScatterOp(&scope, ctx);
-}
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/collective/c_allreduce_op.h"
+#include "paddle/fluid/operators/collective/c_broadcast_op.h"
+#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
+#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
+#include "paddle/fluid/string/printf.h"
+#include "paddle/phi/kernels/funcs/math_function.h"
+
+#if defined(PADDLE_WITH_ASCEND_CL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/npu/hccl_helper.h"
+#endif
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+USE_OP(c_reducescatter);
+USE_NO_KERNEL_OP(c_gen_hccl_id);
+USE_NO_KERNEL_OP(c_comm_init_hccl);
+USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
+
+DECLARE_string(selected_npus);
+
+// Logs the contents of `data` at VLOG(2).
+//
+// The output is `preStr`, a colon and a line break, followed by every element
+// formatted with std::to_string and terminated by "," (the last element also
+// gets a trailing comma).
+template <typename T>
+void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
+  std::string debugstring = "";
+  for (auto ele : data) {
+    debugstring += std::to_string(ele) + std::string(",");
+  }
+  VLOG(2) << preStr << ":" << std::endl << debugstring;
+}
+
+// Runs the c_gen_hccl_id op and copies the resulting root info to `hccl_id`.
+//
+// scope:   scope in which the "Out" variable (an HcclRootInfo) is created.
+// ctx:     device context; supplies the place the op runs on and is waited on
+//          after the op has run.
+// hccl_id: destination that receives 1024 bytes copied from the "Out"
+//          variable.
+//
+// The rank and device are read from the RANK_ID and DEVICE_ID environment
+// variables.
+// NOTE(review): getenv() returns a null pointer for an unset variable and the
+// result is passed straight to atoi - confirm the test harness always exports
+// both variables.
+void PrepareUniqueId(f::Scope* scope,
+                     const p::DeviceContext& ctx,
+                     HcclRootInfo* hccl_id) {
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  // NOTE(review): the value logged under the "RANK_TABLE_FILE" label is
+  // DEVICE_ID again, and rank_id is logged twice.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // NOTE(review): rank_ids is not used anywhere else in this function.
+  std::vector<int> rank_ids{0, 1};
+  f::AttributeMap gen_hccl_id;
+
+  // Two fixed local endpoints; this rank takes the entry at index rank_id and
+  // the remaining entry becomes the single "other" endpoint.
+  // NOTE(review): endpointList is indexed with rank_id unchecked, so only
+  // rank 0 and rank 1 are valid here.
+  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
+  gen_hccl_id["rank"] = rank_id;
+  gen_hccl_id["endpoint"] = endpointList[rank_id];
+  std::vector<std::string> other_endpoints = {
+      endpointList[rank_id == 0 ? 1 : 0]};
+  gen_hccl_id["other_endpoints"] = other_endpoints;
+
+  // Output variable of the op; `id` points at its HcclRootInfo payload.
+  auto out = scope->Var("Out");
+  auto id = out->GetMutable<HcclRootInfo>();
+
+  VLOG(3) << "break";
+
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
+  VLOG(3) << "break";
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+
+  // NOTE(review): the copy length is a hard-coded 1024 bytes - confirm it
+  // matches the size of HcclRootInfo.
+  memcpy(hccl_id, id, 1024);
+}
+
+// Runs the c_comm_init_hccl op using the root info in `hccl_id`.
+//
+// scope:   scope in which the "X" variable (an HcclRootInfo) is created.
+// ctx:     device context; supplies the place the op runs on and is waited on
+//          after the op has run.
+// hccl_id: source of the 1024 bytes copied into the "X" variable before the
+//          op runs.
+//
+// The rank and device are read from the RANK_ID and DEVICE_ID environment
+// variables.
+// NOTE(review): getenv() returns a null pointer for an unset variable and the
+// result is passed straight to atoi - confirm the test harness always exports
+// both variables.
+void Prepare(f::Scope* scope,
+             const p::DeviceContext& ctx,
+             HcclRootInfo* hccl_id) {
+  auto x = scope->Var("X");
+  auto id = x->GetMutable<HcclRootInfo>();
+
+  // NOTE(review): the copy length is a hard-coded 1024 bytes - confirm it
+  // matches the size of HcclRootInfo.
+  memcpy(id, hccl_id, 1024);
+
+  int rank_id = atoi(getenv("RANK_ID"));
+  int device_id = atoi(getenv("DEVICE_ID"));
+
+  // NOTE(review): the value logged under the "RANK_TABLE_FILE" label is
+  // DEVICE_ID again, and rank_id is logged twice.
+  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
+          << "; rank_id = " << rank_id
+          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
+
+  // std::vector<int> rank_ids{0, 1};
+  f::AttributeMap comm_init_attrs;
+  comm_init_attrs["ring_id"] = 0;
+  // "rank_ids" is set to the integer 2 here; the vector form is left
+  // commented out around this block.
+  comm_init_attrs["rank_ids"] = 2;
+  comm_init_attrs["rank"] = rank_id;
+  comm_init_attrs["device_id"] = device_id;
+  // comm_init_attrs["rank_ids"] = rank_ids;
+  auto comm_init_op = f::OpRegistry::CreateOp(
+      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
+  auto place = ctx.GetPlace();
+  comm_init_op->Run(*scope, place);
+  ctx.Wait();
+}
+
+// Runs the c_reducescatter op on a small all-ones tensor and checks the
+// result.
+//
+// scope: scope holding the "Data" (input) and "OutData" (output) variables
+//        created here.
+// ctx:   device context used for the host/device copies and waited on after
+//        each step.
+//
+// The input is num1 * num2 = 4 floats of value 1.0. After the op has run, the
+// test expects half as many output elements as input elements, each equal to
+// 2.0.
+void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
+  // init
+  auto x = scope->Var("Data");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<float> init;
+  int num1 = 4;
+  int num2 = 1;
+
+  for (int64_t i = 0; i < num1 * num2; ++i) {
+    init.push_back(1.0);
+  }
+  PrintDebugInfo("input data", init);
+
+  // Copy the host vector into the input tensor, then give it a 2-D shape.
+  paddle::framework::TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({num1, num2});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("OutData");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+  // The output is allocated with the full input shape here, although the
+  // check below expects only half as many elements - presumably the op
+  // resizes its output; TODO confirm.
+  tensor_out->Resize({num1, num2});
+  tensor_out->mutable_data<float>(place);  // allocate
+
+  ctx.Wait();
+
+  // run
+  f::AttributeMap attrs;
+  attrs["tag"] = std::string("tagx");
+  attrs["ring_id"] = 0;
+  attrs["nranks"] = 2;
+
+  auto op = f::OpRegistry::CreateOp(
+      "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
+
+  // The same op instance is run iter_num times, waiting on the context after
+  // every run; only the final output is inspected.
+  int iter_num = 10;
+  for (int i = 0; i < iter_num; i++) {
+    op->Run(*scope, place);
+    ctx.Wait();
+  }
+
+  // Copy the output back to the host for verification.
+  std::vector<float> out_vec;
+  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
+  ctx.Wait();
+
+  PrintDebugInfo("output data", out_vec);
+  EXPECT_EQ(out_vec.size(), init.size() / 2);
+  for (uint32_t i = 0; i < out_vec.size(); i++) {
+    EXPECT_EQ(out_vec[i], 2.0);
+  }
+}
+
+// gtest entry point for the c_reducescatter NPU test: builds a scope and an
+// NPU device context, runs PrepareUniqueId and Prepare, then runs
+// TestHCCLReduceScatterOp once.
+TEST(c_reducescatter, NPU) {
+  f::Scope scope;
+  HcclRootInfo hccl_id;
+
+  // only support one device, if more than one device, use first default
+  // (FLAGS_selected_npus is converted with atoi.)
+  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
+
+  // hccl_id is filled by PrepareUniqueId and then handed to Prepare.
+  PrepareUniqueId(&scope, ctx, &hccl_id);
+  Prepare(&scope, ctx, &hccl_id);
+  TestHCCLReduceScatterOp(&scope, ctx);
+}
--- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc
-/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/utils.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-template <typename T>
-class FillConstantMKLDNNHandler
-    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
- public:
-  FillConstantMKLDNNHandler(Tensor* out,
-                            dnnl::engine engine,
-                            platform::Place cpu_place)
-      : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
-    const auto src0_md =
-        dnnl::memory::desc({out->numel(), sizeof(T)},
-                           platform::MKLDNNGetDataType<uint8_t>(),
-                           dnnl::memory::format_tag::ab);
-
-    dnnl::primitive_attr attrs;
-    attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f});
-
-    this->AcquireForwardPrimitiveDescriptor(
-        attrs, dnnl::algorithm::binary_add, src0_md, src1_md, src0_md);
-  }
-
-  static const dnnl::memory::desc src1_md;
-};
-
-template <typename T>
-const dnnl::memory::desc FillConstantMKLDNNHandler<T>::src1_md(
-    {1, sizeof(T)},
-    platform::MKLDNNGetDataType<uint8_t>(),
-    dnnl::memory::format_tag::ab);
-
-template <typename T>
-class FillConstantMKLDNNKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    this->RunKernel(ctx);
-  }
-
-  void RunKernel(const framework::ExecutionContext& ctx) const {
-    const auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& dnnl_engine = dev_ctx.GetEngine();
-
-    auto* out = ctx.Output<Tensor>("Out");
-    T fill_value = CalculateFillValue(ctx);
-
-    auto shape = GetShape(ctx);
-    out->Resize(shape);
-
-    FillConstantMKLDNNHandler<T> handler(out, dnnl_engine, ctx.GetPlace());
-
-    dnnl::memory constant_value_memory =
-        dnnl::memory(FillConstantMKLDNNHandler<T>::src1_md,
-                     dnnl_engine,
-                     reinterpret_cast<uint8_t*>(&fill_value));
-
-    auto src0_memory_p = handler.AcquireDstMemory(out);
-    auto fill_constant_p = handler.AcquireForwardPrimitive();
-
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-    fill_constant_p->execute(astream,
-                             {{DNNL_ARG_SRC_0, *src0_memory_p},
-                              {DNNL_ARG_SRC_1, constant_value_memory},
-                              {DNNL_ARG_DST, *src0_memory_p}});
-    astream.wait();
-
-    // src0_memory_p's md was just to allow the usage of a binary
-    // primitive as a memset, and now we need to create a real one
-    out->set_mem_desc({phi::vectorize(shape),
-                       platform::MKLDNNGetDataType<T>(),
-                       platform::GetPlainMKLDNNFormat(shape.size())});
-  }
-
-  T CalculateFillValue(const framework::ExecutionContext& ctx) const {
-    const auto str_value = ctx.Attr<std::string>("str_value");
-    const auto float_value = ctx.Attr<float>("value");
-
-    T value;
-
-    if (str_value.empty()) {
-      value = static_cast<T>(float_value);
-    } else {
-      // handle NaN/Inf first, which cannot be read from stream
-      if (str_value == "inf") {
-        value = static_cast<T>(std::numeric_limits<float>::infinity());
-      } else if (str_value == "-inf") {
-        value = static_cast<T>(-std::numeric_limits<float>::infinity());
-      } else if (str_value == "nan") {
-        value = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
-      } else {
-        std::stringstream convert_stream(str_value);
-        double tmp_value;
-        convert_stream >> tmp_value;
-        value = static_cast<T>(tmp_value);
-      }
-    }
-
-    if (ctx.HasInput("ValueTensor")) {
-      const auto* value_tensor = ctx.Input<Tensor>("ValueTensor");
-      PADDLE_ENFORCE_EQ(
-          value_tensor->numel(),
-          1,
-          platform::errors::InvalidArgument(
-              "When use Tensor as value to set Tensor value in fill_constant, "
-              "value input(ValueTensor) size must be 1, but got %d",
-              value_tensor->numel()));
-      value = value_tensor->data<T>()[0];
-    }
-
-    return value;
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_KERNEL(fill_constant,
-                   MKLDNN,
-                   paddle::platform::CPUPlace,
-                   ops::FillConstantMKLDNNKernel<float>);
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+// Handler that sets up a dnnl::binary (binary_add) primitive over a byte view
+// of the output tensor. The kernel in this file uses that primitive as a
+// memset-like fill (see the comment in RunKernel).
+template <typename T>
+class FillConstantMKLDNNHandler
+    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
+ public:
+  // out:       tensor to be filled; only its numel() is read here.
+  // engine:    oneDNN engine forwarded to the base handler.
+  // cpu_place: place forwarded to the base handler.
+  FillConstantMKLDNNHandler(Tensor* out,
+                            dnnl::engine engine,
+                            platform::Place cpu_place)
+      : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
+    // The output is described as a 2-D uint8 tensor of shape
+    // {numel, sizeof(T)}, i.e. one row of bytes per element.
+    const auto src0_md =
+        dnnl::memory::desc({out->numel(), sizeof(T)},
+                           platform::MKLDNNGetDataType<uint8_t>(),
+                           dnnl::memory::format_tag::ab);
+
+    // SRC_0 gets a scale of 0.0f - presumably so that the add reduces to the
+    // SRC_1 value alone; TODO confirm against the oneDNN scale semantics.
+    dnnl::primitive_attr attrs;
+    attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f});
+
+    // src0_md is used for both SRC_0 and DST; src1_md describes SRC_1.
+    this->AcquireForwardPrimitiveDescriptor(
+        attrs, dnnl::algorithm::binary_add, src0_md, src1_md, src0_md);
+  }
+
+  // Descriptor of the SRC_1 operand; defined out of class below.
+  static const dnnl::memory::desc src1_md;
+};
+
+// Out-of-class definition of the static SRC_1 descriptor: a {1, sizeof(T)}
+// uint8 tensor in `ab` format, i.e. a single row with the bytes of one T.
+template <typename T>
+const dnnl::memory::desc FillConstantMKLDNNHandler<T>::src1_md(
+    {1, sizeof(T)},
+    platform::MKLDNNGetDataType<uint8_t>(),
+    dnnl::memory::format_tag::ab);
+
+// fill_constant kernel for the MKLDNN (oneDNN) backend. It fills the "Out"
+// tensor with a single value by executing the binary primitive prepared by
+// FillConstantMKLDNNHandler.
+template <typename T>
+class FillConstantMKLDNNKernel : public framework::OpKernel<T> {
+ public:
+  // OpKernel entry point; forwards to RunKernel.
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    this->RunKernel(ctx);
+  }
+
+  // Resizes "Out" to the shape returned by GetShape(ctx), fills it with the
+  // value from CalculateFillValue(ctx), and finally sets its memory
+  // descriptor to a plain-format one with data type T.
+  void RunKernel(const framework::ExecutionContext& ctx) const {
+    const auto& dev_ctx =
+        ctx.template device_context<platform::MKLDNNDeviceContext>();
+    const auto& dnnl_engine = dev_ctx.GetEngine();
+
+    auto* out = ctx.Output<Tensor>("Out");
+    T fill_value = CalculateFillValue(ctx);
+
+    auto shape = GetShape(ctx);
+    out->Resize(shape);
+
+    FillConstantMKLDNNHandler<T> handler(out, dnnl_engine, ctx.GetPlace());
+
+    // Wrap the local fill_value as the SRC_1 operand. The memory object is
+    // given the address of the local variable, so fill_value must stay alive
+    // until the stream wait below.
+    dnnl::memory constant_value_memory =
+        dnnl::memory(FillConstantMKLDNNHandler<T>::src1_md,
+                     dnnl_engine,
+                     reinterpret_cast<uint8_t*>(&fill_value));
+
+    auto src0_memory_p = handler.AcquireDstMemory(out);
+    auto fill_constant_p = handler.AcquireForwardPrimitive();
+
+    // The output memory is passed as both SRC_0 and DST.
+    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+    fill_constant_p->execute(astream,
+                             {{DNNL_ARG_SRC_0, *src0_memory_p},
+                              {DNNL_ARG_SRC_1, constant_value_memory},
+                              {DNNL_ARG_DST, *src0_memory_p}});
+    astream.wait();
+
+    // src0_memory_p's md was just to allow the usage of a binary
+    // primitive as a memset, and now we need to create a real one
+    out->set_mem_desc({phi::vectorize(shape),
+                       platform::MKLDNNGetDataType<T>(),
+                       platform::GetPlainMKLDNNFormat(shape.size())});
+  }
+
+  // Determines the value to fill with, in increasing order of precedence:
+  //   1. the float attribute "value", when "str_value" is empty;
+  //   2. the string attribute "str_value" ("inf", "-inf" and "nan" are
+  //      matched literally, anything else is parsed as a double);
+  //   3. the single element of input "ValueTensor", when that input exists.
+  // Raises InvalidArgument (via PADDLE_ENFORCE_EQ) when "ValueTensor" does
+  // not hold exactly one element.
+  // NOTE(review): std::numeric_limits and std::stringstream are used here,
+  // but this file does not include <limits> or <sstream> directly -
+  // presumably they arrive through the two project headers; confirm.
+  T CalculateFillValue(const framework::ExecutionContext& ctx) const {
+    const auto str_value = ctx.Attr<std::string>("str_value");
+    const auto float_value = ctx.Attr<float>("value");
+
+    T value;
+
+    if (str_value.empty()) {
+      value = static_cast<T>(float_value);
+    } else {
+      // handle NaN/Inf first, which cannot be read from stream
+      if (str_value == "inf") {
+        value = static_cast<T>(std::numeric_limits<float>::infinity());
+      } else if (str_value == "-inf") {
+        value = static_cast<T>(-std::numeric_limits<float>::infinity());
+      } else if (str_value == "nan") {
+        value = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
+      } else {
+        // NOTE(review): the stream state is not checked after extraction, so
+        // a non-numeric str_value is not reported as an error.
+        std::stringstream convert_stream(str_value);
+        double tmp_value;
+        convert_stream >> tmp_value;
+        value = static_cast<T>(tmp_value);
+      }
+    }
+
+    // A ValueTensor input, when present, overrides the attribute-derived
+    // value computed above.
+    if (ctx.HasInput("ValueTensor")) {
+      const auto* value_tensor = ctx.Input<Tensor>("ValueTensor");
+      PADDLE_ENFORCE_EQ(
+          value_tensor->numel(),
+          1,
+          platform::errors::InvalidArgument(
+              "When use Tensor as value to set Tensor value in fill_constant, "
+              "value input(ValueTensor) size must be 1, but got %d",
+              value_tensor->numel()));
+      value = value_tensor->data<T>()[0];
+    }
+
+    return value;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+// Register the MKLDNN fill_constant kernel for CPUPlace; only the float
+// instantiation is registered here.
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(fill_constant,
+                   MKLDNN,
+                   paddle::platform::CPUPlace,
+                   ops::FillConstantMKLDNNKernel<float>);
--- a/paddle/fluid/operators/unbind_op.cc
+++ b/paddle/fluid/operators/unbind_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/unbind_op.h"
-
-#include <string>
-
-#include "paddle/fluid/framework/infershape_utils.h"
-#include "paddle/phi/core/infermeta_utils.h"
-#include "paddle/phi/infermeta/unary.h"
-
-namespace paddle {
-namespace operators {
-using framework::Tensor;
-
-class UnbindOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE_EQ(
-        ctx->HasInput("X"),
-        true,
-        platform::errors::NotFound("Input(X) of UnbindOp is not found."));
-    PADDLE_ENFORCE_GE(
-        ctx->Outputs("Out").size(),
-        1UL,
-        platform::errors::NotFound("Outputs(Out) of UnbindOp is not found."));
-    auto in_dims = ctx->GetInputDim("X");
-    auto outs_names = ctx->Outputs("Out");
-    int axis = ctx->Attrs().Get<int>("axis");
-    const size_t outs_number = outs_names.size();
-    auto out_dims = UnbindOutsDims(in_dims, axis);
-    std::vector<framework::DDim> outs_dims(outs_number, out_dims);
-    ctx->SetOutputsDim("Out", outs_dims);
-    for (size_t i = 0; i < outs_number; ++i) {
-      ctx->ShareLoD("X", "Out", 0, i);
-    }
-  }
-};
-
-class UnbindOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("X", "(Tensor) Input tensor of the split operator.");
-    AddOutput("Out", "(Tensor) Output tensors of the unbind operator.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-Unbind operator
-
-Remove a tensor dimension.
-
-Example:
-  Input = [[1,2],
-           [3,4],
-           [5,6]]
-  axis = 0
-  Output[0] = [1,2]
-  Output[1] = [3,4]
-  Output[2] = [5,6]
-
-    )DOC");
-    AddAttr<int>("axis",
-                 "(int, default 0) "
-                 "dimension to remove.")
-        .SetDefault(0);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OPERATOR(unbind,
-                  ops::UnbindOp,
-                  ops::UnbindOpMaker,
-                  ops::UnbindGradMaker<paddle::framework::OpDesc>,
-                  ops::UnbindGradMaker<paddle::imperative::OpBase>);
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/unbind_op.h"
+
+#include <string>
+
+#include "paddle/fluid/framework/infershape_utils.h"
+#include "paddle/phi/core/infermeta_utils.h"
+#include "paddle/phi/infermeta/unary.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+// Operator definition for `unbind`; this class only provides shape inference.
+class UnbindOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks that input "X" exists and that "Out" has at least one entry, then
+  // gives every output the dims of X with the `axis` dimension removed (see
+  // UnbindOutsDims) and shares X's LoD with each output.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(
+        ctx->HasInput("X"),
+        true,
+        platform::errors::NotFound("Input(X) of UnbindOp is not found."));
+    PADDLE_ENFORCE_GE(
+        ctx->Outputs("Out").size(),
+        1UL,
+        platform::errors::NotFound("Outputs(Out) of UnbindOp is not found."));
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    int axis = ctx->Attrs().Get<int>("axis");
+    const size_t outs_number = outs_names.size();
+    // All outputs share the same shape, so one DDim is computed and
+    // replicated outs_number times.
+    auto out_dims = UnbindOutsDims(in_dims, axis);
+    std::vector<framework::DDim> outs_dims(outs_number, out_dims);
+    ctx->SetOutputsDim("Out", outs_dims);
+    for (size_t i = 0; i < outs_number; ++i) {
+      ctx->ShareLoD("X", "Out", 0, i);
+    }
+  }
+};
+
+// Declares the proto of the `unbind` operator: one input "X", a duplicable
+// output "Out", and an int attribute "axis" that defaults to 0.
+class UnbindOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    // NOTE(review): the description of "X" mentions the "split operator"
+    // although this is the unbind operator - likely a copy/paste leftover.
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the unbind operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+Unbind operator
+
+Remove a tensor dimension.
+
+Example:
+  Input = [[1,2],
+           [3,4],
+           [5,6]]
+  axis = 0
+  Output[0] = [1,2]
+  Output[1] = [3,4]
+  Output[2] = [5,6]
+
+    )DOC");
+    AddAttr<int>("axis",
+                 "(int, default 0) "
+                 "dimension to remove.")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+// Register `unbind` with its shape-inference class, its proto maker, and the
+// grad maker instantiated for both OpDesc and imperative OpBase.
+REGISTER_OPERATOR(unbind,
+                  ops::UnbindOp,
+                  ops::UnbindOpMaker,
+                  ops::UnbindGradMaker<paddle::framework::OpDesc>,
+                  ops::UnbindGradMaker<paddle::imperative::OpBase>);
--- a/paddle/fluid/operators/unbind_op.h
+++ b/paddle/fluid/operators/unbind_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <chrono>  // NOLINT
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/strided_memcpy.h"
-#include "paddle/fluid/operators/utils.h"
-
-namespace paddle {
-namespace operators {
-static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims,
-                                             int axis) {
-  std::vector<int> out_dims;
-  axis = axis < 0 ? in_dims.size() + axis : axis;
-  for (int i = 0; i < in_dims.size(); i++) {
-    if (i != axis) out_dims.push_back(in_dims[i]);
-  }
-  return phi::make_ddim(out_dims);
-}
-
-template <typename T>
-class UnbindGradMaker : public framework::SingleGradOpMaker<T> {
- public:
-  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
-
- protected:
-  void Apply(GradOpPtr<T> op) const override {
-    op->SetType("stack");
-    op->SetInput("X", this->OutputGrad("Out"));
-    op->SetOutput("Y", this->InputGrad("X"));
-    op->SetAttrMap(this->Attrs());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <chrono>  // NOLINT
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/fluid/operators/strided_memcpy.h"
+#include "paddle/fluid/operators/utils.h"
+
+namespace paddle {
+namespace operators {
+// Returns the dims of one unbind output: `in_dims` with the `axis` dimension
+// dropped.
+//
+// in_dims: dims of the input tensor (taken by value, so it is copied on each
+//          call).
+// axis:    dimension to remove; a negative value has in_dims.size() added to
+//          it first.
+//
+// NOTE(review): axis is not range-checked. If it is still outside
+// [0, in_dims.size()) after the adjustment, no dimension matches and the
+// result keeps every input dimension.
+// NOTE(review): dimensions are stored in a std::vector<int> - presumably DDim
+// elements are wider than int; confirm that narrowing is acceptable.
+static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims,
+                                             int axis) {
+  std::vector<int> out_dims;
+  axis = axis < 0 ? in_dims.size() + axis : axis;
+  for (int i = 0; i < in_dims.size(); i++) {
+    if (i != axis) out_dims.push_back(in_dims[i]);
+  }
+  return phi::make_ddim(out_dims);
+}
+
+// Grad-op maker for `unbind`. It does not define a dedicated grad op;
+// instead it emits a "stack" op.
+template <typename T>
+class UnbindGradMaker : public framework::SingleGradOpMaker<T> {
+ public:
+  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
+
+ protected:
+  // Configures `op` as a "stack" op whose input "X" is the gradient of the
+  // forward "Out" and whose output "Y" is the gradient of the forward "X".
+  // The forward op's attribute map is copied over unchanged.
+  void Apply(GradOpPtr<T> op) const override {
+    op->SetType("stack");
+    op->SetInput("X", this->OutputGrad("Out"));
+    op->SetOutput("Y", this->InputGrad("X"));
+    op->SetAttrMap(this->Attrs());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle