[CodeStyle] remove crlf for cpp files (#46156)

846c7e70 · Nyakku Shigure · GitHub · c6c9c186 · 846c7e70 · 846c7e70
10 changed files
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <glog/logging.h>
 #include <iosfwd>
 #include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/inference/analysis/argument.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
 /*
 * AnalysisPass is the abstract base of the passes used to control the IR
 * passes. Callers invoke Run(); concrete passes supply RunImpl() and repr().
 */
 class AnalysisPass {
 public:
  AnalysisPass() = default;
  virtual ~AnalysisPass() = default;

  // Entry point for running the pass on a single Graph: forwards `argument`
  // unchanged to the subclass hook RunImpl().
  void Run(Argument* argument) {
    RunImpl(argument);
  }

  // Short human-readable name of the pass.
  virtual std::string repr() const = 0;

  // Longer human-readable description; "No DOC" unless a subclass overrides.
  virtual std::string description() const {
    return "No DOC";
  }

 protected:
  // Hook every concrete pass has to implement.
  virtual void RunImpl(Argument* argument) = 0;
 };
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 USE_OP(c_allgather);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allgather, NPU);
 DECLARE_string(selected_npus);
 // Logs at VLOG level 2: `preStr`, a colon and a newline, followed by every
 // element of `data` rendered with std::to_string and terminated by a comma.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string joined;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    joined += std::to_string(data[idx]);
    joined += std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << joined;
 }
 // Generates the HCCL root info for this process by running the
 // `c_gen_hccl_id` operator, then copies the result into `*hccl_id`.
 //
 // RANK_ID and DEVICE_ID are read from the environment. RANK_ID has to be 0 or
 // 1 because it selects one of the two hard-coded loopback endpoints. A missing
 // variable or an out-of-range rank raises a fatal gtest failure and the
 // function returns before the operator is run.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; reject anything outside [0, 1].
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The only peer is the endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();
  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0: stores `*hccl_id` in the
 // scope variable "X" and runs the `c_comm_init_hccl` operator on it.
 //
 // RANK_ID and DEVICE_ID are read from the environment; if either is unset a
 // fatal gtest failure is raised and the function returns immediately.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" receives the scalar 2, presumably the number of
  // ranks rather than a list of ids -- confirm against the operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Runs `c_allgather` ten times on a {1, 4} float tensor filled with
 // 1.0 + RANK_ID and checks the result: twice as many elements as the input,
 // the first half equal to 1.0 and the second half equal to 2.0.
 //
 // Raises a fatal gtest failure and returns early if RANK_ID is unset.
 void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // atoi(nullptr) is undefined behaviour, so validate getenv() first.
  const char* rank_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  int rank_id = atoi(rank_env);

  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num1 = 1;
  int num2 = 4;
  std::vector<float> init(num1 * num2, static_cast<float>(1.0 + rank_id));
  PrintDebugInfo("input data", init);
  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  attrs["nranks"] = 2;
  auto op = f::OpRegistry::CreateOp(
      "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  // Rank 0 contributes 1.0 and rank 1 contributes 2.0.
  EXPECT_EQ(out_vec.size(), init.size() * 2);
  const size_t half = out_vec.size() / 2;
  for (size_t i = 0; i < half; i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
  for (size_t i = half; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
 }
 // End-to-end check of the NPU c_allgather kernel: generate the HCCL root
 // info, initialise the communicator, then run the operator test.
 TEST(c_allgather, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;
  // Only one device is supported: FLAGS_selected_npus is parsed with atoi(),
  // so when several devices are listed the first one is used.
  const int npu_index = atoi(FLAGS_selected_npus.c_str());
  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllGatherOp(&scope, ctx);
 }
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 USE_OP(c_allreduce_max);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
 DECLARE_string(selected_npus);
 // Logs at VLOG level 2: `preStr`, a colon and a newline, followed by every
 // element of `data` rendered with std::to_string and terminated by a comma.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string joined;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    joined += std::to_string(data[idx]);
    joined += std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << joined;
 }
 // Generates the HCCL root info for this process by running the
 // `c_gen_hccl_id` operator, then copies the result into `*hccl_id`.
 //
 // RANK_ID and DEVICE_ID are read from the environment. RANK_ID has to be 0 or
 // 1 because it selects one of the two hard-coded loopback endpoints. A missing
 // variable or an out-of-range rank raises a fatal gtest failure and the
 // function returns before the operator is run.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; reject anything outside [0, 1].
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The only peer is the endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();
  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0: stores `*hccl_id` in the
 // scope variable "X" and runs the `c_comm_init_hccl` operator on it.
 //
 // RANK_ID and DEVICE_ID are read from the environment; if either is unset a
 // fatal gtest failure is raised and the function returns immediately.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" receives the scalar 2, presumably the number of
  // ranks rather than a list of ids -- confirm against the operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Runs `c_allreduce_max` ten times on a {100, 100} float tensor filled with
 // 1.0 + 3 * RANK_ID and checks that the output has the same number of
 // elements as the input and that every element equals 4.0.
 //
 // Raises a fatal gtest failure and returns early if RANK_ID is unset.
 void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // atoi(nullptr) is undefined behaviour, so validate getenv() first.
  const char* rank_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  int rank_id = atoi(rank_env);

  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num1 = 100;
  int num2 = 100;
  std::vector<float> init(num1 * num2, static_cast<float>(1.0 + rank_id * 3));
  PrintDebugInfo("input data", init);
  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  auto op = f::OpRegistry::CreateOp(
      "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  // The larger of the two inputs is 1.0 + 3 * 1 = 4.0 (rank 1).
  EXPECT_EQ(out_vec.size(), init.size());
  for (size_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 4.0);
  }
 }
 // End-to-end check of the NPU c_allreduce_max kernel: generate the HCCL root
 // info, initialise the communicator, then run the operator test.
 TEST(c_allreduce_max, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;
  // Only one device is supported: FLAGS_selected_npus is parsed with atoi(),
  // so when several devices are listed the first one is used.
  const int npu_index = atoi(FLAGS_selected_npus.c_str());
  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllReduceOp(&scope, ctx);
 }
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 // Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
 // DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
 // Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
 // DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 USE_OP(c_allreduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
 DECLARE_string(selected_npus);
 // Writes to stdout: `preStr`, a colon and a newline, then every element of
 // `data` followed by a single space, then a closing newline.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::cout << preStr << ":" << std::endl;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    std::cout << data[idx] << " ";
  }
  std::cout << std::endl;
 }
 // Generates the HCCL root info for this process by running the
 // `c_gen_hccl_id` operator, then copies the result into `*hccl_id`.
 //
 // RANK_ID and DEVICE_ID are read from the environment. RANK_ID has to be 0 or
 // 1 because it selects one of the two hard-coded loopback endpoints. A missing
 // variable or an out-of-range rank raises a fatal gtest failure and the
 // function returns before the operator is run.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; reject anything outside [0, 1].
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The only peer is the endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();
  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0: stores `*hccl_id` in the
 // scope variable "X" and runs the `c_comm_init_hccl` operator on it.
 //
 // RANK_ID and DEVICE_ID are read from the environment; if either is unset a
 // fatal gtest failure is raised and the function returns immediately.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" receives the scalar 2, presumably the number of
  // ranks rather than a list of ids -- confirm against the operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Runs `c_allreduce_sum` once on a {3, 128} tensor of element type T filled
 // with 1.0 + RANK_ID, except that element 0 is a quiet NaN.
 //
 // Checks that the output has as many elements as the input, that element 0
 // is within 0.1 of 65504, and that elements 1..9 equal 3.0. `iter` is only
 // used to build the operator's "tag" attribute.
 //
 // Raises a fatal gtest failure and returns early if RANK_ID is unset or the
 // output size does not match the input size.
 template <typename T>
 void TestHCCLAllReduceOp(f::Scope* scope,
                          const p::DeviceContext& ctx,
                          int iter) {
  // atoi(nullptr) is undefined behaviour, so validate getenv() first.
  const char* rank_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  int rank_id = atoi(rank_env);

  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num1 = 3;
  int num2 = 128;
  std::vector<T> init(num1 * num2, static_cast<T>(1.0 + rank_id));
  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
  PrintDebugInfo("input data", init);

  auto place = ctx.GetPlace();
  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<T>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
  attrs["ring_id"] = 0;
  attrs["use_calc_stream"] = 1;
  auto op = f::OpRegistry::CreateOp(
      "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  for (int i = 0; i < 1; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<T> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  // Check the size before indexing so a short output cannot be read out of
  // bounds by the element checks below.
  ASSERT_EQ(out_vec.size(), init.size());
  float diff = static_cast<float>(out_vec[0]) - 65504;
  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
  for (uint32_t i = 1; i < 10; i++) {
    // Compare in the tensor's own element type rather than a fixed float16.
    EXPECT_EQ(out_vec[i], static_cast<T>(3.0));
  }
 }
 // End-to-end check of the NPU c_allreduce_sum kernel with float16 data:
 // generate the HCCL root info, initialise the communicator, run the op test.
 TEST(c_allreduce_sum, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;
  // Only one device is supported: FLAGS_selected_npus is parsed with atoi(),
  // so when several devices are listed the first one is used.
  const int npu_index = atoi(FLAGS_selected_npus.c_str());
  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
  // The float variant is currently disabled:
  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
 }
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 USE_OP(c_broadcast);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
 DECLARE_string(selected_npus);
 // Logs at VLOG level 2: `preStr`, a colon and a newline, followed by every
 // element of `data` rendered with std::to_string and terminated by a comma.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string joined;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    joined += std::to_string(data[idx]);
    joined += std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << joined;
 }
 // Generates the HCCL root info for this process by running the
 // `c_gen_hccl_id` operator, then copies the result into `*hccl_id`.
 //
 // RANK_ID and DEVICE_ID are read from the environment. RANK_ID has to be 0 or
 // 1 because it selects one of the two hard-coded loopback endpoints. A missing
 // variable or an out-of-range rank raises a fatal gtest failure and the
 // function returns before the operator is run.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; reject anything outside [0, 1].
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The only peer is the endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();
  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0: stores `*hccl_id` in the
 // scope variable "X" and runs the `c_comm_init_hccl` operator on it.
 //
 // RANK_ID and DEVICE_ID are read from the environment; if either is unset a
 // fatal gtest failure is raised and the function returns immediately.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" receives the scalar 2, presumably the number of
  // ranks rather than a list of ids -- confirm against the operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Runs `c_broadcast` (root 0, ring 0) ten times on a {2, 2} float tensor
 // filled with 1.0 + RANK_ID and checks that the output has the same number
 // of elements as the input and that every element equals 1.0.
 //
 // Raises a fatal gtest failure and returns early if RANK_ID is unset.
 void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // atoi(nullptr) is undefined behaviour, so validate getenv() first.
  const char* rank_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  int rank_id = atoi(rank_env);

  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num = 2;
  std::vector<float> init(num * num, static_cast<float>(1.0 + rank_id));
  PrintDebugInfo("input data", init);
  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num, num});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num, num});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["root"] = 0;
  attrs["ring_id"] = 0;
  auto op = f::OpRegistry::CreateOp(
      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  // Root rank 0 holds 1.0 everywhere, so every rank should end up with 1.0.
  EXPECT_EQ(out_vec.size(), init.size());
  for (size_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
 }
 // End-to-end check of the NPU c_broadcast kernel: generate the HCCL root
 // info, initialise the communicator, then run the operator test.
 TEST(c_broadcast, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;
  // Only one device is supported: FLAGS_selected_npus is parsed with atoi(),
  // so when several devices are listed the first one is used.
  const int npu_index = atoi(FLAGS_selected_npus.c_str());
  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLBroadcastOp(&scope, ctx);
 }
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 USE_OP(c_reduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
 DECLARE_string(selected_npus);
 // Logs at VLOG level 3: `preStr`, a colon and a newline, followed by every
 // element of `data` rendered with std::to_string and terminated by a comma.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string joined;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    joined += std::to_string(data[idx]);
    joined += std::string(",");
  }
  VLOG(3) << preStr << ":" << std::endl << joined;
 }
 // Generates the HCCL root info for this process by running the
 // `c_gen_hccl_id` operator, then copies the result into `*hccl_id`.
 //
 // RANK_ID and DEVICE_ID are read from the environment. RANK_ID has to be 0 or
 // 1 because it selects one of the two hard-coded loopback endpoints. A missing
 // variable or an out-of-range rank raises a fatal gtest failure and the
 // function returns before the operator is run.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; reject anything outside [0, 1].
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The only peer is the endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();
  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0: stores `*hccl_id` in the
 // scope variable "X" and runs the `c_comm_init_hccl` operator on it.
 //
 // RANK_ID and DEVICE_ID are read from the environment; if either is unset a
 // fatal gtest failure is raised and the function returns immediately.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // getenv() returns nullptr for an unset variable and atoi(nullptr) is
  // undefined behaviour, so check before parsing.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr) << "DEVICE_ID is not set";
  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): 1024 is a hard-coded byte count; confirm it does not exceed
  // sizeof(HcclRootInfo).
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" receives the scalar 2, presumably the number of
  // ranks rather than a list of ids -- confirm against the operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Runs `c_reduce_sum` (root 0, ring 0) once on a {3, 128} float tensor filled
 // with 1.0 + RANK_ID. The output must have as many elements as the input;
 // on the root rank every element must equal 3.0, on any other rank the
 // output must equal the input. `iter` is only used to build the operator's
 // "tag" attribute.
 //
 // Raises a fatal gtest failure and returns early if RANK_ID is unset.
 void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
  // atoi(nullptr) is undefined behaviour, so validate getenv() first.
  const char* rank_env = getenv("RANK_ID");
  ASSERT_TRUE(rank_env != nullptr) << "RANK_ID is not set";
  int rank_id = atoi(rank_env);

  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<f::LoDTensor>();
  int num1 = 3;
  int num2 = 128;
  std::vector<float> init(num1 * num2, static_cast<float>(1.0 + rank_id));
  PrintDebugInfo("input data", init);

  auto place = ctx.GetPlace();
  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
  attrs["ring_id"] = 0;
  int root_id = 0;
  attrs["root_id"] = root_id;
  auto op = f::OpRegistry::CreateOp(
      "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  op->Run(*scope, place);
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size());
  const bool is_root = (rank_id == root_id);
  for (size_t i = 0; i < out_vec.size(); i++) {
    if (is_root) {
      // 1.0 (rank 0) + 2.0 (rank 1)
      EXPECT_EQ(out_vec[i], 3.0);
    } else {
      EXPECT_EQ(out_vec[i], init[i]);
    }
  }
 }
 // End-to-end check of the NPU c_reduce_sum kernel: generate the HCCL root
 // info, initialise the communicator, then run the operator test twice.
 TEST(c_reduce_sum, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;
  // Only one device is supported: FLAGS_selected_npus is parsed with atoi(),
  // so when several devices are listed the first one is used.
  const int npu_index = atoi(FLAGS_selected_npus.c_str());
  p::NPUDeviceContext ctx((p::NPUPlace(npu_index)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);

  int round = 0;
  while (round < 2) {
    VLOG(2) << "iter num: " << round;
    TestHCCLReduceOp(&scope, ctx, round);
    ++round;
  }
 }
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #ifndef _WIN32
 #include <unistd.h>
 #endif
 #include <stdio.h>
 #include <string>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
 #if defined(PADDLE_WITH_ASCEND_CL)
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/npu/hccl_helper.h"
 #endif
 // Short aliases used by every helper in this test.
 namespace f = paddle::framework;
 namespace p = paddle::platform;
 // Reference the operators (and the NPU kernel of c_reducescatter) that the
 // helpers below create by name through f::OpRegistry::CreateOp.
 // NOTE(review): presumably these macros force the registrations to be linked
 // into the test binary - confirm against op_registry.h.
 USE_OP(c_reducescatter);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
 USE_NO_KERNEL_OP(c_comm_init_hccl);
 USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
 // Flag read by the TEST body to pick the NPU device index.
 DECLARE_string(selected_npus);
 // Logs `data` at VLOG level 2: first `preStr` followed by ':', then on the
 // next line every element rendered with std::to_string and followed by ','.
 template <typename T>
 void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string joined;
  for (size_t idx = 0; idx < data.size(); ++idx) {
    joined += std::to_string(data[idx]);
    joined += ",";
  }
  VLOG(2) << preStr << ":" << std::endl << joined;
 }
 // Runs the `c_gen_hccl_id` operator for this process and copies the id it
 // produces into `*hccl_id`.
 //
 // scope   - scope in which the operator's output variable "Out" is created.
 // ctx     - device context; the operator runs on ctx.GetPlace().
 // hccl_id - destination buffer; receives 1024 bytes of the generated id.
 //
 // Requires the RANK_ID and DEVICE_ID environment variables. RANK_ID must be
 // 0 or 1 because it selects one of the two hard-coded loopback endpoints.
 // A violated precondition is reported as a fatal gtest failure and the
 // function returns without touching `scope` or `hccl_id`.
 void PrepareUniqueId(f::Scope* scope,
                      const p::DeviceContext& ctx,
                      HcclRootInfo* hccl_id) {
  // atoi(nullptr) is undefined behaviour, so check the environment first.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "environment variable RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr)
      << "environment variable DEVICE_ID is not set";

  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  // rank_id indexes endpointList below; anything but 0 or 1 would read out of
  // bounds.
  ASSERT_TRUE(rank_id == 0 || rank_id == 1)
      << "RANK_ID must be 0 or 1, got " << rank_id;

  f::AttributeMap gen_hccl_id;
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  // The single peer is whichever endpoint this rank does not own.
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";

  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  // NOTE(review): copies a fixed 1024 bytes rather than sizeof(HcclRootInfo) -
  // confirm this matches the size the operator actually fills in.
  memcpy(hccl_id, id, 1024);
 }
 // Initialises the HCCL communicator for ring 0 by running the
 // `c_comm_init_hccl` operator with the id previously stored in `*hccl_id`.
 //
 // scope   - scope in which the operator's input variable "X" is created.
 // ctx     - device context; the operator runs on ctx.GetPlace().
 // hccl_id - source buffer; 1024 bytes of it are copied into variable "X".
 //
 // Requires the RANK_ID and DEVICE_ID environment variables. If either is
 // missing a fatal gtest failure is recorded and the function returns before
 // touching `scope`.
 void Prepare(f::Scope* scope,
              const p::DeviceContext& ctx,
              HcclRootInfo* hccl_id) {
  // atoi(nullptr) is undefined behaviour, so check the environment first.
  const char* rank_env = getenv("RANK_ID");
  const char* device_env = getenv("DEVICE_ID");
  ASSERT_TRUE(rank_env != nullptr) << "environment variable RANK_ID is not set";
  ASSERT_TRUE(device_env != nullptr)
      << "environment variable DEVICE_ID is not set";

  int rank_id = atoi(rank_env);
  int device_id = atoi(device_env);
  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id;

  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();
  // NOTE(review): copies a fixed 1024 bytes rather than sizeof(HcclRootInfo) -
  // confirm this matches the size of the id.
  memcpy(id, hccl_id, 1024);

  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  // NOTE(review): "rank_ids" is given the int 2 - presumably the number of
  // ranks taking part; confirm against the c_comm_init_hccl operator.
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;

  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
 }
 // Feeds a 4x1 tensor of ones to `c_reducescatter` (ring 0, nranks 2), runs the
 // operator ten times and expects an output of half the input length in which
 // every element equals 2.0.
 void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // Input variable "Data": num1 x num2 floats, all set to 1.0.
  auto* input_tensor = scope->Var("Data")->GetMutable<f::LoDTensor>();
  int num1 = 4;
  int num2 = 1;
  std::vector<float> init(static_cast<size_t>(num1 * num2), 1.0f);
  PrintDebugInfo("input data", init);

  paddle::framework::TensorFromVector(init, ctx, input_tensor);
  input_tensor->Resize({num1, num2});
  ctx.Wait();

  // Output variable "OutData": allocated on the device before the op runs.
  auto place = ctx.GetPlace();
  auto* tensor_out = scope->Var("OutData")->GetMutable<f::LoDTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);
  ctx.Wait();

  // Build the operator once and run it repeatedly.
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  attrs["nranks"] = 2;
  auto op = f::OpRegistry::CreateOp(
      "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  int remaining_runs = 10;
  while (remaining_runs-- > 0) {
    op->Run(*scope, place);
    ctx.Wait();
  }

  // Copy the result back to the host and verify it.
  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size() / 2);
  for (size_t i = 0; i < out_vec.size(); ++i) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
 }
 // Drives the c_reducescatter operator on an NPU: sets up the HCCL
 // communicator, then runs the reduce-scatter check once.
 TEST(c_reducescatter, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // FLAGS_selected_npus is parsed with atoi, so only the leading integer of the
  // flag (the first listed device) is used.
  const int device_index = atoi(FLAGS_selected_npus.c_str());
  const p::NPUPlace npu_place(device_index);
  p::NPUDeviceContext ctx(npu_place);

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLReduceScatterOp(&scope, ctx);
 }
--- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc
 /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/utils.h"
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 namespace paddle {
 namespace operators {
 using framework::Tensor;
 // Handler that sets up a oneDNN `binary_add` primitive so it can act as a
 // fill: source 0 is scaled by 0.0f and source 1 is the one-row descriptor
 // `src1_md`. Both the first source and the destination use a byte view of
 // `out` with out->numel() rows of sizeof(T) uint8 columns.
 template <typename T>
 class FillConstantMKLDNNHandler
    : public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
 public:
  // out       - tensor to be filled; only its numel() is read here.
  // engine    - oneDNN engine forwarded to the base handler.
  // cpu_place - place forwarded to the base handler.
  FillConstantMKLDNNHandler(Tensor* out,
                            dnnl::engine engine,
                            platform::Place cpu_place)
      : platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
    // Byte-level view of the output: one row per element, one column per byte.
    const dnnl::memory::desc out_bytes_md(
        {out->numel(), sizeof(T)},
        platform::MKLDNNGetDataType<uint8_t>(),
        dnnl::memory::format_tag::ab);

    // Scale source 0 by zero so its current contents do not contribute.
    dnnl::primitive_attr zero_src0_attr;
    zero_src0_attr.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f});

    this->AcquireForwardPrimitiveDescriptor(zero_src0_attr,
                                            dnnl::algorithm::binary_add,
                                            out_bytes_md,
                                            src1_md,
                                            out_bytes_md);
  }

  // Descriptor of the second operand: a single row of sizeof(T) bytes.
  static const dnnl::memory::desc src1_md;
 };
 // Out-of-class definition of the static descriptor for the second binary
 // operand: a 1 x sizeof(T) uint8 matrix in `ab` layout. The kernel wraps the
 // address of the scalar fill value in a dnnl::memory built from this
 // descriptor.
 template <typename T>
 const dnnl::memory::desc FillConstantMKLDNNHandler<T>::src1_md(
    {1, sizeof(T)},
    platform::MKLDNNGetDataType<uint8_t>(),
    dnnl::memory::format_tag::ab);
 // oneDNN (MKLDNN) kernel for `fill_constant`. It resizes "Out" to the
 // requested shape and fills it by executing the binary primitive prepared by
 // FillConstantMKLDNNHandler, using the scalar fill value as second operand.
 template <typename T>
 class FillConstantMKLDNNKernel : public framework::OpKernel<T> {
 public:
  // Kernel entry point; all work is delegated to RunKernel.
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx);
  }

  // Resizes "Out", runs the fill primitive in place on it and finally attaches
  // a plain memory descriptor of element type T to the tensor.
  void RunKernel(const framework::ExecutionContext& ctx) const {
    const auto& onednn_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& engine = onednn_ctx.GetEngine();

    auto* out = ctx.Output<Tensor>("Out");
    T fill_value = CalculateFillValue(ctx);

    auto shape = GetShape(ctx);
    out->Resize(shape);

    FillConstantMKLDNNHandler<T> handler(out, engine, ctx.GetPlace());

    // The second operand points straight at the local scalar's bytes.
    dnnl::memory scalar_memory(FillConstantMKLDNNHandler<T>::src1_md,
                               engine,
                               reinterpret_cast<uint8_t*>(&fill_value));

    // The same memory object serves as first source and as destination.
    auto out_memory_p = handler.AcquireDstMemory(out);
    auto fill_primitive_p = handler.AcquireForwardPrimitive();

    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    fill_primitive_p->execute(astream,
                              {{DNNL_ARG_SRC_0, *out_memory_p},
                               {DNNL_ARG_SRC_1, scalar_memory},
                               {DNNL_ARG_DST, *out_memory_p}});
    astream.wait();

    // The byte-view descriptor only existed so that a binary primitive could
    // be used as a memset; the tensor now gets its real descriptor.
    out->set_mem_desc({phi::vectorize(shape),
                       platform::MKLDNNGetDataType<T>(),
                       platform::GetPlainMKLDNNFormat(shape.size())});
  }

  // Determines the scalar to fill with. An empty "str_value" attribute selects
  // the float attribute "value"; otherwise "str_value" is interpreted ("inf",
  // "-inf" and "nan" are special-cased, anything else is read as a double).
  // If input "ValueTensor" is present, its single element overrides the result.
  T CalculateFillValue(const framework::ExecutionContext& ctx) const {
    const std::string str_value = ctx.Attr<std::string>("str_value");
    const float float_value = ctx.Attr<float>("value");

    T value;
    if (str_value.empty()) {
      value = static_cast<T>(float_value);
    } else if (str_value == "inf") {
      // NaN/Inf are matched by name because they cannot be read from a stream.
      value = static_cast<T>(std::numeric_limits<float>::infinity());
    } else if (str_value == "-inf") {
      value = static_cast<T>(-std::numeric_limits<float>::infinity());
    } else if (str_value == "nan") {
      value = static_cast<T>(std::numeric_limits<float>::quiet_NaN());
    } else {
      std::stringstream convert_stream(str_value);
      double parsed_value;
      convert_stream >> parsed_value;
      value = static_cast<T>(parsed_value);
    }

    if (!ctx.HasInput("ValueTensor")) {
      return value;
    }

    const auto* value_tensor = ctx.Input<Tensor>("ValueTensor");
    PADDLE_ENFORCE_EQ(
        value_tensor->numel(),
        1,
        platform::errors::InvalidArgument(
            "When use Tensor as value to set Tensor value in fill_constant, "
            "value input(ValueTensor) size must be 1, but got %d",
            value_tensor->numel()));
    value = value_tensor->data<T>()[0];
    return value;
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Registers the MKLDNN fill_constant kernel for CPUPlace; only the float
 // instantiation is registered here.
 REGISTER_OP_KERNEL(fill_constant,
                   MKLDNN,
                   paddle::platform::CPUPlace,
                   ops::FillConstantMKLDNNKernel<float>);
--- a/paddle/fluid/operators/unbind_op.cc
+++ b/paddle/fluid/operators/unbind_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/unbind_op.h"
 #include <string>
 #include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/infermeta/unary.h"
 namespace paddle {
 namespace operators {
 using framework::Tensor;
 // Operator class for `unbind`; it only provides shape inference.
 class UnbindOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  // Requires input "X" and at least one "Out" variable, then gives every
  // output the shape computed by UnbindOutsDims from X's dims and the "axis"
  // attribute, and shares X's LoD with each output.
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("X"),
        true,
        platform::errors::NotFound("Input(X) of UnbindOp is not found."));
    PADDLE_ENFORCE_GE(
        ctx->Outputs("Out").size(),
        1UL,
        platform::errors::NotFound("Outputs(Out) of UnbindOp is not found."));

    const auto x_dims = ctx->GetInputDim("X");
    const size_t num_outputs = ctx->Outputs("Out").size();
    const int axis = ctx->Attrs().Get<int>("axis");

    // All outputs share one shape, so compute it once and replicate it.
    const std::vector<framework::DDim> per_output_dims(
        num_outputs, UnbindOutsDims(x_dims, axis));
    ctx->SetOutputsDim("Out", per_output_dims);

    size_t out_idx = 0;
    while (out_idx < num_outputs) {
      ctx->ShareLoD("X", "Out", 0, out_idx);
      ++out_idx;
    }
  }
 };
 // Declares the proto of the `unbind` operator: one input "X", a duplicable
 // output "Out", the operator comment and the int attribute "axis"
 // (default 0).
 class UnbindOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    // NOTE(review): the description below says "split operator" although this
    // is the unbind operator - looks copied from elsewhere; confirm before
    // changing, since the text is part of the operator's proto.
    AddInput("X", "(Tensor) Input tensor of the split operator.");
    // "Out" is duplicable, i.e. it may be bound to several variables.
    AddOutput("Out", "(Tensor) Output tensors of the unbind operator.")
        .AsDuplicable();
    AddComment(R"DOC(
 Unbind operator
 Remove a tensor dimension.
 Example:
  Input = [[1,2],
           [3,4],
           [5,6]]
  axis = 0
  Output[0] = [1,2]
  Output[1] = [3,4]
  Output[2] = [5,6]
    )DOC");
    // Dimension to remove; defaults to 0 when not given.
    AddAttr<int>("axis",
                 "(int, default 0) "
                 "dimension to remove.")
        .SetDefault(0);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 // Registers the `unbind` operator with its proto maker and with the gradient
 // maker instantiated for both framework::OpDesc and imperative::OpBase.
 REGISTER_OPERATOR(unbind,
                  ops::UnbindOp,
                  ops::UnbindOpMaker,
                  ops::UnbindGradMaker<paddle::framework::OpDesc>,
                  ops::UnbindGradMaker<paddle::imperative::OpBase>);
--- a/paddle/fluid/operators/unbind_op.h
+++ b/paddle/fluid/operators/unbind_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include <chrono>  // NOLINT
 #include <cstdint>
 #include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/strided_memcpy.h"
 #include "paddle/fluid/operators/utils.h"
 namespace paddle {
 namespace operators {
 // Returns the shape shared by every output of `unbind`: `in_dims` with the
 // dimension at `axis` removed.
 //
 // in_dims - shape of the input tensor.
 // axis    - dimension to drop; a negative value counts from the end
 //           (axis + in_dims.size()).
 //
 // Extents are collected as int64_t so that large dimensions are passed to
 // phi::make_ddim unchanged; the previous std::vector<int> narrowed them.
 // NOTE(review): `axis` is not range-checked here; an out-of-range value
 // removes nothing and the full shape is returned.
 static inline framework::DDim UnbindOutsDims(const framework::DDim in_dims,
                                             int axis) {
  const int rank = in_dims.size();
  axis = axis < 0 ? rank + axis : axis;

  std::vector<int64_t> out_dims;
  out_dims.reserve(rank > 0 ? rank - 1 : 0);
  for (int i = 0; i < rank; i++) {
    if (i != axis) out_dims.push_back(in_dims[i]);
  }
  return phi::make_ddim(out_dims);
 }
 // Gradient maker for `unbind`: the backward op is a `stack` op whose input
 // "X" is the gradient of the forward "Out" and whose output "Y" is the
 // gradient of the forward "X". The forward attributes are passed on as-is.
 template <typename T>
 class UnbindGradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("stack");

    const auto out_grads = this->OutputGrad("Out");
    op->SetInput("X", out_grads);

    const auto x_grads = this->InputGrad("X");
    op->SetOutput("Y", x_grads);

    const auto& forward_attrs = this->Attrs();
    op->SetAttrMap(forward_attrs);
  }
 };
 }  // namespace operators
 }  // namespace paddle