Unverified commit 5ad94e7b authored by Leo Chen, committed by GitHub

fix NPUDeviceContext in all c++ unittest (#32198)

* fix NPUDeviceContext in all c++ unittest

* refine log
Co-authored-by: pangyoki <pangyoki@126.com>
Parent 054f8e7a
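For reference, every test file touched by this commit applies the same substitution: instead of constructing a local `p::NPUDeviceContext`, the test borrows the shared context from the global `DeviceContextPool` and passes it by reference, so the test helpers and the operator run against the same pooled context. A minimal sketch of that pattern follows (it assumes a PaddlePaddle source tree built with `PADDLE_WITH_ASCEND_CL`, with `p` aliased to `paddle::platform` as in the tests below; the device index 0 is illustrative):

```cpp
// Sketch of the context-acquisition pattern used throughout this commit.
// Assumes a PaddlePaddle build with PADDLE_WITH_ASCEND_CL.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

namespace p = paddle::platform;

void UseSharedNPUContext() {
  // Before: each test constructed its own context, e.g.
  //   p::NPUDeviceContext ctx(p::NPUPlace(0));
  // After: fetch the process-wide context from the DeviceContextPool
  // singleton and use it everywhere the test needs a DeviceContext.
  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
  ctx->Wait();  // synchronize the stream owned by the shared context
}
```

The diff below repeats this change in each NPU unit test, plus minor clang-format cleanups and a refined log message in `NPUDeviceContext::Wait()`.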
......@@ -120,12 +120,12 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
TEST(check_finite_and_unscale, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(check_finite_and_unscale, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx);
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx);
}
......@@ -56,10 +56,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
auto op = f::OpRegistry::CreateOp(op_type,
{{"X", {"X"}}},
{{"Out", {"Out"}}},
{});
auto op =
f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
op->Run(*scope, place);
......@@ -75,11 +73,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
}
TEST(assign, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "assign");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "assign");
}
......@@ -16,23 +16,23 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
......@@ -50,22 +50,20 @@ USE_OP_DEVICE_KERNEL(c_allgather, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
......@@ -112,14 +110,14 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
attrs["nranks"]=2;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
attrs["nranks"] = 2;
auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
......@@ -139,13 +137,13 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
}
}
TEST(c_allgather, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
TestHCCLAllGatherOp(&scope, ctx);
Prepare(&scope, *ctx);
TestHCCLAllGatherOp(&scope, *ctx);
}
......@@ -16,23 +16,23 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
......@@ -50,22 +50,20 @@ USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
......@@ -112,13 +110,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
......@@ -139,8 +137,9 @@ TEST(c_allreduce_max, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
TestHCCLAllReduceOp(&scope, ctx);
Prepare(&scope, *ctx);
TestHCCLAllReduceOp(&scope, *ctx);
}
......@@ -16,19 +16,19 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
......@@ -47,22 +47,20 @@ USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(3) << preStr << ":" << std::endl <<debugstring;
VLOG(3) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
......@@ -80,7 +78,8 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
ctx.Wait();
}
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx,
int iter) {
// init
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
......@@ -109,15 +108,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter)
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx_"+ std::to_string(iter));
attrs["ring_id"]=0;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_allreduce_sum",
{{"X", {"X"}}},
{{"Out", {"Out"}}},
attrs);
auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
......@@ -138,11 +135,12 @@ TEST(c_allreduce_sum, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
for(int i = 0; i < 1; i ++){
Prepare(&scope, *ctx);
for (int i = 0; i < 1; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLAllReduceOp(&scope, ctx, i);
TestHCCLAllReduceOp(&scope, *ctx, i);
}
}
......@@ -16,19 +16,19 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
......@@ -47,22 +47,20 @@ USE_OP_DEVICE_KERNEL(c_broadcast, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
......@@ -106,14 +104,14 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["root"]=0;
attrs["ring_id"]=0;
attrs["tag"] = std::string("tagx");
attrs["root"] = 0;
attrs["ring_id"] = 0;
auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
ctx.Wait();
......@@ -133,8 +131,9 @@ TEST(c_broadcast, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
TestHCCLBroadcastOp(&scope, ctx);
Prepare(&scope, *ctx);
TestHCCLBroadcastOp(&scope, *ctx);
}
......@@ -16,19 +16,19 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_reduce_op.h"
......@@ -47,22 +47,20 @@ USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(3) << preStr << ":" << std::endl <<debugstring;
VLOG(3) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("RANK_TABLE_FILE"));
......@@ -109,15 +107,13 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx_"+ std::to_string(iter));
attrs["ring_id"]=0;
attrs["tag"] = std::string("tagx_" + std::to_string(iter));
attrs["ring_id"] = 0;
int root_id = 0;
attrs["root_id"]=root_id;
attrs["root_id"] = root_id;
auto op = f::OpRegistry::CreateOp("c_reduce_sum",
{{"X", {"X"}}},
{{"Out", {"Out"}}},
attrs);
auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
ctx.Wait();
......@@ -130,10 +126,9 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
EXPECT_EQ(out_vec.size(), init.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
if(rank_id == root_id){
if (rank_id == root_id) {
EXPECT_EQ(out_vec[i], 3.0);
}
else{
} else {
EXPECT_EQ(out_vec[i], init[i]);
}
}
......@@ -143,11 +138,12 @@ TEST(c_reduce_sum, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
for(int i = 0; i < 2; i ++){
Prepare(&scope, *ctx);
for (int i = 0; i < 2; i++) {
VLOG(2) << "iter num: " << i;
TestHCCLReduceOp(&scope, ctx, i);
TestHCCLReduceOp(&scope, *ctx, i);
}
}
......@@ -16,23 +16,23 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
......@@ -50,22 +50,20 @@ USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);
DECLARE_string(selected_npus);
template<typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T> &data){
template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
std::string debugstring = "";
for (auto ele : data) {
debugstring += std::to_string(ele) + std::string(",");
}
VLOG(2) << preStr << ":" << std::endl <<debugstring;
VLOG(2) << preStr << ":" << std::endl << debugstring;
}
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
VLOG(2) << "rank_id = " << rank_id
<< "; device_id = " << device_id
VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
<< "; rank_id = " << rank_id
<< "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));
......@@ -112,15 +110,15 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
// run
f::AttributeMap attrs;
attrs["tag"]=std::string("tagx");
attrs["ring_id"]=0;
attrs["nranks"]=2;
attrs["tag"] = std::string("tagx");
attrs["ring_id"] = 0;
attrs["nranks"] = 2;
auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
int iter_num = 10;
for (int i = 0; i < iter_num; i ++) {
for (int i = 0; i < iter_num; i++) {
op->Run(*scope, place);
}
ctx.Wait();
......@@ -140,8 +138,9 @@ TEST(c_reducescatter, NPU) {
f::Scope scope;
// only support one device, if more than one device, use first default
p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
auto* ctx = p::DeviceContextPool::Instance().Get(
p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
Prepare(&scope, ctx);
TestHCCLReduceScatterOp(&scope, ctx);
Prepare(&scope, *ctx);
TestHCCLReduceScatterOp(&scope, *ctx);
}
......@@ -102,6 +102,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(c_sync_calc_stream, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -124,8 +124,8 @@ TEST(c_broadcast, NPU) {
f::Scope scope;
char* npu_id = getenv("FLAGS_selected_npus");
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
Prepare(&scope, ctx);
TestHCCLBroadcastOp(&scope, ctx);
Prepare(&scope, *ctx);
TestHCCLBroadcastOp(&scope, *ctx);
}
......@@ -16,19 +16,19 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/recv_v2_op.h"
......@@ -45,37 +45,38 @@ USE_OP(recv_v2);
USE_NO_KERNEL_OP(c_comm_init_hcom);
USE_OP_DEVICE_KERNEL(recv_v2, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
std::string rank_table_file = getenv("RANK_TABLE_FILE");
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
int src_rank = atoi(getenv("SRC_RANK"));
int dest_rank = atoi(getenv("DEST_RANK"));
VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
<< dest_rank;
std::vector<int> rank_ids = {0,1};
std::vector<int> rank_ids = {0, 1};
f::AttributeMap comm_init_attrs;
comm_init_attrs["ring_id"] = 0;
comm_init_attrs["nranks"] = 2;
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
VLOG(3) << "CreateOp c_comm_init_hcom";
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
int num = atoi(getenv("DATA_SIZE"));
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3) << "rank_id:" << rank_id<<std::endl;
VLOG(3) << "rank_id:" << rank_id << std::endl;
ctx.Wait();
auto place = ctx.GetPlace();
......@@ -87,38 +88,37 @@ void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){
ctx.Wait();
f::AttributeMap attrs;
attrs["tag"]=std::string("srtest");
attrs["peer"]=atoi(getenv("SRC_RANK"));
attrs["ring_id"]=0;
attrs["srTag"]=0;
attrs["tag"] = std::string("srtest");
attrs["peer"] = atoi(getenv("SRC_RANK"));
attrs["ring_id"] = 0;
attrs["srTag"] = 0;
std::vector<int> out_shape;
out_shape.push_back(num);
out_shape.push_back(num);
attrs["out_shape"]=out_shape;
attrs["out_shape"] = out_shape;
auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs);
VLOG(3) << "CreateOp recv_v2";
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
VLOG(3) << "Run op recv_v2";
std::vector<float> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec);
ctx.Wait();
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
EXPECT_EQ(out_vec == init, true);
}
TEST(recv_v2, NPU){
TEST(recv_v2, NPU) {
f::Scope scope;
char * npu_id=getenv("FLAGS_selected_npus");
char* npu_id = getenv("FLAGS_selected_npus");
VLOG(3) << "Select npu:" << npu_id;
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
VLOG(3) << "Place over";
Prepare(&scope, ctx);
Prepare(&scope, *ctx);
VLOG(3) << "Prepare over";
TestHcomRecvOp(&scope, ctx);
TestHcomRecvOp(&scope, *ctx);
VLOG(3) << "Test over";
}
......@@ -16,18 +16,18 @@ limitations under the License. */
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include <stdio.h>
#include "gtest/gtest.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/operators/collective/send_v2_op.h"
......@@ -44,14 +44,14 @@ USE_OP(send_v2);
USE_NO_KERNEL_OP(c_comm_init_hcom);
USE_OP_DEVICE_KERNEL(send_v2, NPU);
void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
void Prepare(f::Scope* scope, const p::DeviceContext& ctx) {
std::string rank_table_file = getenv("RANK_TABLE_FILE");
int rank_id = atoi(getenv("RANK_ID"));
int device_id = atoi(getenv("DEVICE_ID"));
int src_rank = atoi(getenv("SRC_RANK"));
int dest_rank = atoi(getenv("DEST_RANK"));
VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" <<dest_rank;
VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank"
<< dest_rank;
std::vector<int> rank_ids = {0, 1};
f::AttributeMap comm_init_attrs;
......@@ -60,22 +60,24 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){
comm_init_attrs["rank"] = rank_id;
comm_init_attrs["device_id"] = device_id;
comm_init_attrs["rank_ids"] = rank_ids;
auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
auto comm_init_op =
f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs);
auto place = ctx.GetPlace();
comm_init_op->Run(*scope, place);
ctx.Wait();
}
void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
std::cout<< "BEGIN TEST:"<< __FUNCTION__ <<std::endl;
void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
auto x = scope->Var("X");
auto tensor_x = x->GetMutable<f::LoDTensor>();
int num = atoi(getenv("DATA_SIZE"));;
int num = atoi(getenv("DATA_SIZE"));
EXPECT_GT(num, 0);
EXPECT_LT(num, 1 << 15);
std::vector<float> init(num*num, 1.0 * atoi(getenv("DEST_RANK")));
std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
int rank_id = atoi(getenv("RANK_ID"));
VLOG(3)<<"rank id:"<<rank_id;
VLOG(3) << "rank id:" << rank_id;
TensorFromVector(init, ctx, tensor_x);
tensor_x->Resize({num, num});
ctx.Wait();
......@@ -83,29 +85,28 @@ void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){
ctx.Wait();
f::AttributeMap attrs;
attrs["tag"]=std::string("srtest");
attrs["peer"]=atoi(getenv("DEST_RANK"));
attrs["ring_id"]=0;
attrs["srTag"]=0;
attrs["tag"] = std::string("srtest");
attrs["peer"] = atoi(getenv("DEST_RANK"));
attrs["ring_id"] = 0;
attrs["srTag"] = 0;
auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs);
for (int i = 0; i < 10; i ++) {
for (int i = 0; i < 10; i++) {
op->Run(*scope, place);
}
VLOG(3)<<"send run over";
VLOG(3) << "send run over";
ctx.Wait();
}
TEST(send_v2, NPU){
TEST(send_v2, NPU) {
f::Scope scope;
char * npu_id=getenv("FLAGS_selected_npus");
char* npu_id = getenv("FLAGS_selected_npus");
VLOG(3) << "Select npu:" << npu_id;
p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id)));
VLOG(3) << "Place over";
Prepare(&scope, ctx);
Prepare(&scope, *ctx);
VLOG(3) << "Prepare over";
TestHcomSendOp(&scope, ctx);
TestHcomSendOp(&scope, *ctx);
VLOG(3) << "Test over";
}
......@@ -38,7 +38,7 @@ USE_OP(elementwise_sub);
USE_OP_DEVICE_KERNEL(elementwise_sub, NPU);
template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
void Compare(f::Scope *scope, const p::DeviceContext &ctx,
std::string op_type) {
// init
auto x = scope->Var("X");
......@@ -90,7 +90,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx,
std::string op_type) {
// init
auto dout = scope->Var("DOut");
......@@ -154,30 +154,30 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TEST(elementwise_add, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_add");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_add");
}
TEST(elementwise_sub, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "elementwise_sub");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "elementwise_sub");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "elementwise_sub");
}
TEST(elementwise_sub_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "elementwise_sub_grad");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_sub_grad");
}
TEST(elementwise_add_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "elementwise_add_grad");
auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "elementwise_add_grad");
}
......@@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(expand, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TEST(gather, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "gather");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "gather");
}
TEST(gather, NPU_fp16) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<p::float16>(&scope, ctx, "gather");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<p::float16>(&scope, *ctx, "gather");
}
TEST(gather_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx, "gather_grad");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx, "gather_grad");
}
......@@ -59,8 +59,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// run
auto place = ctx.GetPlace();
auto op = f::OpRegistry::CreateOp("gelu", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
auto op = f::OpRegistry::CreateOp("gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}},
attrs);
op->Run(*scope, place);
ctx.Wait();
......@@ -76,8 +76,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();
gettimeofday(&end, NULL);
int micros = (((end.tv_sec - start.tv_sec) * 1000000) +
end.tv_usec) - (start.tv_usec);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
......@@ -141,8 +141,8 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
ctx.Wait();
gettimeofday(&end, NULL);
int micros = (((end.tv_sec - start.tv_sec) * 1000000) +
end.tv_usec) - (start.tv_usec);
int micros =
(((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec);
printf("used time: %d\n", micros / 100);
// eval value
......@@ -157,13 +157,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(gelu, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(gelu_grad, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -54,10 +54,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>();
f::AttributeMap attr_input = { {"step", static_cast<float>(2.0)} };
f::AttributeMap attr_input = {{"step", static_cast<float>(2.0)}};
auto op = f::OpRegistry::CreateOp("increment", {{"X", {"X"}}},
{{"Out", {"Out"}}},
attr_input);
{{"Out", {"Out"}}}, attr_input);
op->Run(*scope, place);
......@@ -70,16 +69,14 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
EXPECT_EQ(out_vec[0], static_cast<T>(3.0));
}
TEST(increment, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "increment");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx, "increment");
}
TEST(increment, NPU_fp64) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx, "increment");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<double>(&scope, *ctx, "increment");
}
......@@ -67,9 +67,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
auto tensor_out = out->GetMutable<f::LoDTensor>();
// run
auto op = f::OpRegistry::CreateOp(op_type, {{"Start", {"Start"}},
{"End", {"End"}},
{"Step", {"Step"}}},
auto op = f::OpRegistry::CreateOp(
op_type, {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}},
{{"Out", {"Out"}}}, {});
op->Run(*scope, place);
......@@ -86,10 +85,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
EXPECT_EQ(static_cast<T>(out_vec[4]), static_cast<T>(9.0));
}
TEST(range, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<int>(&scope, ctx, "range");
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<int>(&scope, *ctx, "range");
}
......@@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(reduce_any, NPU) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<bool>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<bool>(&scope, *ctx);
}
......@@ -21,11 +21,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
......@@ -59,14 +58,12 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
// run
int axis = 1;
f::AttributeMap attrs = {
{"axis", axis},
{"use_cudnn", false},
{"use_mkldnn", false},
{"mkldnn_data_type", std::string("float32")},
{"is_test", false}, };
{"axis", axis}, {"use_cudnn", false},
{"use_mkldnn", false}, {"mkldnn_data_type", std::string("float32")},
{"is_test", false},
};
auto op =
f::OpRegistry::CreateOp("softmax", {{"X", {"X"}}},
auto op = f::OpRegistry::CreateOp("softmax", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
......@@ -76,7 +73,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TensorToVector(*tensor_out, ctx, &out_vec);
for (int i = 0; i < static_cast<int>(out_vec.size()); ++i) {
VLOG(3) << "out_vec[" << i << "] : "<< out_vec[i];
VLOG(3) << "out_vec[" << i << "] : " << out_vec[i];
}
ctx.Wait();
......@@ -84,7 +81,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6));
}
template <typename T>
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
// init
......@@ -132,11 +128,10 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
{"use_mkldnn", false},
{"mkldnn_data_type", std::string("float32")},
{"is_test", false},
{"data_format", std::string("AnyLayout")}, };
auto op =
f::OpRegistry::CreateOp("softmax_grad",
{{"Out", {"Out"}},
{"Out@GRAD", {"DOut"}}},
{"data_format", std::string("AnyLayout")},
};
auto op = f::OpRegistry::CreateOp("softmax_grad",
{{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}},
{{"X@GRAD", {"DX"}}}, attrs);
auto place = ctx.GetPlace();
......@@ -164,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(softmax, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(softmax_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -64,8 +64,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
axis.push_back(2);
f::AttributeMap attrs = {{"axes", axis}};
auto op =
f::OpRegistry::CreateOp("squeeze", {{"X", {"X"}}},
auto op = f::OpRegistry::CreateOp("squeeze", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
......@@ -86,7 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(squeeze, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(transpose2, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
TEST(transpose2_grad, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
CompareGrad<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
CompareGrad<float>(&scope, *ctx);
}
......@@ -63,8 +63,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
axis.push_back(1);
f::AttributeMap attrs = {{"axes", axis}};
auto op =
f::OpRegistry::CreateOp("unsqueeze", {{"X", {"X"}}},
auto op = f::OpRegistry::CreateOp("unsqueeze", {{"X", {"X"}}},
{{"Out", {"Out"}}}, attrs);
op->Run(*scope, place);
......@@ -86,7 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
TEST(unsqueeze, NPU_fp32) {
f::Scope scope;
p::NPUDeviceContext ctx(p::NPUPlace(0));
Compare<float>(&scope, ctx);
auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
Compare<float>(&scope, *ctx);
}
......@@ -254,7 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() {
void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event("NPUDeviceContext/wait");
VLOG(4) << "NPU context Wait";
VLOG(4) << "NPU context(" << this << ") Wait";
stream_->Wait();
}