diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
index 1ed188b15939ee2542299d7cb6d02ce3b5d05dc1..a80b83f0cbe51fe536955b047d7be1b4c451a5a9 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
@@ -120,12 +120,12 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) {
 
 TEST(check_finite_and_unscale, NPU_fp32) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<float>(&scope, ctx);
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx);
 }
 
 TEST(check_finite_and_unscale, NPU_fp16) {
   f::Scope scope;
-  p::NPUDeviceContext ctx(p::NPUPlace(0));
-  Compare<p::float16>(&scope, ctx);
+  auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<p::float16>(&scope, *ctx);
 }
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
index 111f4b177b9b0c2ab1952e97bafc1e34d040d94a..792d01a5efe43034c201a57641cf3dc1b4c38e4c 100644
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ b/paddle/fluid/operators/assign_op_npu_test.cc
@@ -56,10 +56,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   auto out = scope->Var("Out");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
 
-  auto op = f::OpRegistry::CreateOp(op_type,
-                                    {{"X", {"X"}}},
-                                    {{"Out", {"Out"}}},
-                                    {});
+  auto op =
+      f::OpRegistry::CreateOp(op_type, {{"X", {"X"}}}, {{"Out", {"Out"}}}, {});
 
   op->Run(*scope, place);
 
@@ -75,11 +73,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   EXPECT_EQ(out_vec[3], static_cast<T>(4.0));
 }
 
-
 TEST(assign, NPU_fp32) {
-    f::Scope scope;
-    p::NPUDeviceContext ctx(p::NPUPlace(0));
-    Compare<float>(&scope, ctx, "assign");
+  f::Scope scope;
+  auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0));
+  Compare<float>(&scope, *ctx, "assign");
 }
-
-
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index 38f19170af9589bde81a8aa9786079ff63849fc8..f8b30b25516f1b016b7a0949090d69fdceae2055 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -16,23 +16,23 @@ limitations under the License.
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #if defined(PADDLE_WITH_ASCEND_CL) @@ -50,25 +50,23 @@ USE_OP_DEVICE_KERNEL(c_allgather, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; @@ -90,7 +88,7 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + int num1 = 1; int num2 = 4; @@ -112,18 +110,18 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx"); - attrs["ring_id"]=0; - attrs["nranks"]=2; - + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + {{"Out", {"Out"}}}, attrs); - for (int i = 0; i < 10; i ++) { + for (int i = 0; i < 10; i++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -139,13 +137,13 @@ void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { } } - TEST(c_allgather, NPU) { f::Scope scope; - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + // only support one device, if more than one device, use first default + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - TestHCCLAllGatherOp(&scope, ctx); + Prepare(&scope, *ctx); + TestHCCLAllGatherOp(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index 3fdc859506754bb4107d642a12f7d6d1b6d44662..c00c3a9ea1604eeeba930492c0d98c0cbd8abee7 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -16,23 +16,23 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #if defined(PADDLE_WITH_ASCEND_CL) @@ -50,24 +50,22 @@ USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; @@ -112,13 +110,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx"); - attrs["ring_id"]=0; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + {{"Out", {"Out"}}}, attrs); - for (int i = 0; i < 10; i ++) { + for (int i = 0; i < 10; i++) { op->Run(*scope, place); } ctx.Wait(); @@ -139,8 +137,9 @@ TEST(c_allreduce_max, NPU) { f::Scope scope; // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - TestHCCLAllReduceOp(&scope, ctx); + Prepare(&scope, *ctx); + TestHCCLAllReduceOp(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index 2fff84593c00829fd3ad519e2c9c34dbc663459d..2bfab0ee7379f6711d15eb07a001a7b3e54c7130 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -16,19 +16,19 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" @@ -47,24 +47,22 @@ USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; @@ -80,7 +78,8 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ ctx.Wait(); } -void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { // init auto x = scope->Var("X"); auto tensor_x = x->GetMutable(); @@ -109,15 +108,13 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx_"+ std::to_string(iter)); - attrs["ring_id"]=0; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; - auto op = f::OpRegistry::CreateOp("c_allreduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, - attrs); + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); - for (int i = 0; i < 10; i ++) { + for (int i = 0; i < 10; i++) { op->Run(*scope, place); } ctx.Wait(); @@ -138,11 +135,12 @@ TEST(c_allreduce_sum, NPU) { f::Scope scope; // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - for(int i = 0; i < 1; i ++){ + Prepare(&scope, *ctx); + for (int i = 0; i < 1; i++) { VLOG(2) << "iter num: " << i; - TestHCCLAllReduceOp(&scope, ctx, i); + TestHCCLAllReduceOp(&scope, *ctx, i); } } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc index 66158e5ff28ae5e7ced4a9f7d7fafc8c19d57d02..ccffe36681b4f24169021333cfa16eef1002f586 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -16,19 +16,19 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" @@ -47,25 +47,23 @@ USE_OP_DEVICE_KERNEL(c_broadcast, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; comm_init_attrs["ring_id"] = 0; @@ -87,7 +85,7 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { int num = 2; std::vector init; int rank_id = atoi(getenv("RANK_ID")); - + for (int64_t i = 0; i < num * num; ++i) { init.push_back(1.0 + rank_id); } @@ -106,18 +104,18 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx"); - attrs["root"]=0; - attrs["ring_id"]=0; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + {{"Out", {"Out"}}}, attrs); - for (int i = 0; i < 10; i ++) { + for (int i = 0; i < 10; i++) { op->Run(*scope, place); } ctx.Wait(); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); @@ -132,9 +130,10 @@ void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_broadcast, NPU) { f::Scope scope; - // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + // only support one device, if more than one device, use first default + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - TestHCCLBroadcastOp(&scope, ctx); + Prepare(&scope, *ctx); + TestHCCLBroadcastOp(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc index 36ec6d155a214590e1ba5a1b54d3de68d46d495b..8d1da15c8c718a90d05b838d8b1ec13e62b873a2 100644 --- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -16,19 +16,19 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" #include "paddle/fluid/operators/collective/c_reduce_op.h" @@ -47,24 +47,22 @@ USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(3) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; @@ -109,15 +107,13 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx_"+ std::to_string(iter)); - attrs["ring_id"]=0; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; int root_id = 0; - attrs["root_id"]=root_id; + attrs["root_id"] = root_id; - auto op = f::OpRegistry::CreateOp("c_reduce_sum", - {{"X", {"X"}}}, - {{"Out", {"Out"}}}, - attrs); + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); ctx.Wait(); @@ -130,10 +126,9 @@ void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { EXPECT_EQ(out_vec.size(), init.size()); for (uint32_t i = 0; i < out_vec.size(); i++) { - if(rank_id == root_id){ + if (rank_id == root_id) { EXPECT_EQ(out_vec[i], 3.0); - } - else{ + } else { EXPECT_EQ(out_vec[i], init[i]); } } @@ -143,11 +138,12 @@ TEST(c_reduce_sum, NPU) { f::Scope scope; // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - for(int i = 0; i < 2; i ++){ + Prepare(&scope, *ctx); + for (int i = 0; i < 2; i++) { VLOG(2) << "iter num: " << i; - TestHCCLReduceOp(&scope, ctx, i); + TestHCCLReduceOp(&scope, *ctx, i); } } diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc index 1c21ab19b954c94df2a75c1514e6a2b101694213..d116b6a7d8cdc92362a8dd58cbf30db44b3ac1e5 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -16,23 +16,23 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/operators/collective/c_broadcast_op.h" -#include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/c_reducescatter_op.h" #if defined(PADDLE_WITH_ASCEND_CL) @@ -50,24 +50,22 @@ USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); DECLARE_string(selected_npus); -template -void PrintDebugInfo(const std::string preStr, const std::vector &data){ +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; for (auto ele : data) { debugstring += std::to_string(ele) + std::string(","); } - VLOG(2) << preStr << ":" << std::endl < rank_ids{0, 1}; f::AttributeMap comm_init_attrs; @@ -112,15 +110,15 @@ void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { // run f::AttributeMap attrs; - attrs["tag"]=std::string("tagx"); - attrs["ring_id"]=0; - attrs["nranks"]=2; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + {{"Out", {"Out"}}}, attrs); int iter_num = 10; - for (int i = 0; i < iter_num; i ++) { + for (int i = 0; i < iter_num; i++) { op->Run(*scope, place); } ctx.Wait(); @@ -140,8 +138,9 @@ TEST(c_reducescatter, NPU) { f::Scope scope; // only support one device, if more than one device, use first default - p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + auto* ctx = p::DeviceContextPool::Instance().Get( + p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); - Prepare(&scope, ctx); - TestHCCLReduceScatterOp(&scope, ctx); + Prepare(&scope, *ctx); + TestHCCLReduceScatterOp(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 4b1f7bb340178748d302f9ec5a5c987a25dae2e3..94d89356d4a0e63b0afbd1cfd8d955a26d02bb0f 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -102,6 +102,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(c_sync_calc_stream, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index 371bcc4cfcfef427ed0efc09a3ba808d74b01577..a8e61398ca5b035684110d1cbd726907cb4dec04 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -124,8 +124,8 @@ TEST(c_broadcast, NPU) { f::Scope scope; 
char* npu_id = getenv("FLAGS_selected_npus"); - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id))); - Prepare(&scope, ctx); - TestHCCLBroadcastOp(&scope, ctx); + Prepare(&scope, *ctx); + TestHCCLBroadcastOp(&scope, *ctx); } diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc index 727d8be5a8f9ae4f7190bd283c927cdb42408ef5..0067ebcb55d474152ab0aff7595f9879dfdf9399 100644 --- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -16,19 +16,19 @@ limitations under the License. */ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" #include "paddle/fluid/operators/collective/recv_v2_op.h" @@ -45,80 +45,80 @@ USE_OP(recv_v2); USE_NO_KERNEL_OP(c_comm_init_hcom); USE_OP_DEVICE_KERNEL(recv_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ - - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0,1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - VLOG(3) << "CreateOp c_comm_init_hcom"; - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); +void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { + std::string rank_table_file = getenv("RANK_TABLE_FILE"); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + int src_rank = atoi(getenv("SRC_RANK")); + int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank" + << dest_rank; + + std::vector rank_ids = {0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + VLOG(3) << "CreateOp c_comm_init_hcom"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } -void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx){ - std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; - - int num = atoi(getenv("DATA_SIZE")); - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3) << "rank_id:" << rank_id<Var("Out"); - auto tensor_out = out->GetMutable(); - tensor_out->Resize({num, num}); - tensor_out->mutable_data(place); // allocate - - ctx.Wait(); - - f::AttributeMap attrs; 
- attrs["tag"]=std::string("srtest"); - attrs["peer"]=atoi(getenv("SRC_RANK")); - attrs["ring_id"]=0; - attrs["srTag"]=0; - std::vector out_shape; - out_shape.push_back(num); - out_shape.push_back(num); - attrs["out_shape"]=out_shape; - - auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs); - VLOG(3) << "CreateOp recv_v2"; - - for (int i = 0; i < 10; i ++) { - op->Run(*scope, place); - } - VLOG(3) << "Run op recv_v2"; - std::vector out_vec; - TensorToVector(*tensor_out, ctx, &out_vec); - ctx.Wait(); - std::vector init(num*num, 1.0 * atoi(getenv("DEST_RANK"))); - EXPECT_EQ(out_vec == init, true); +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Out"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); } - -TEST(recv_v2, NPU){ - f::Scope scope; - char * npu_id=getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; - TestHcomRecvOp(&scope, ctx); - VLOG(3) << "Test over"; +TEST(recv_v2, NPU) { + f::Scope scope; + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id))); + VLOG(3) << "Place over"; + Prepare(&scope, *ctx); + VLOG(3) << "Prepare over"; + TestHcomRecvOp(&scope, *ctx); + VLOG(3) << "Test over"; } diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc index 7916d155ee7617aa513359070d04f0a00b262339..3bb208e0441a7cb39e4d1ea15037264de0b55ad1 100644 --- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -16,18 +16,18 @@ limitations under the License. 
*/ #include #endif +#include #include #include // NOLINT #include -#include #include "gtest/gtest.h" -#include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" #include "paddle/fluid/operators/collective/send_v2_op.h" @@ -44,68 +44,69 @@ USE_OP(send_v2); USE_NO_KERNEL_OP(c_comm_init_hcom); USE_OP_DEVICE_KERNEL(send_v2, NPU); -void Prepare(f::Scope* scope, const p::DeviceContext& ctx){ - - std::string rank_table_file = getenv("RANK_TABLE_FILE"); - int rank_id = atoi(getenv("RANK_ID")); - int device_id = atoi(getenv("DEVICE_ID")); - int src_rank = atoi(getenv("SRC_RANK")); - int dest_rank = atoi(getenv("DEST_RANK")); - VLOG(3)<<"rank_id "<< rank_id << "src_rank"<< src_rank <<"dest_rank" < rank_ids = {0, 1}; - f::AttributeMap comm_init_attrs; - comm_init_attrs["ring_id"] = 0; - comm_init_attrs["nranks"] = 2; - comm_init_attrs["rank"] = rank_id; - comm_init_attrs["device_id"] = device_id; - comm_init_attrs["rank_ids"] = rank_ids; - auto comm_init_op = f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); - auto place = ctx.GetPlace(); - comm_init_op->Run(*scope, place); - ctx.Wait(); +void Prepare(f::Scope* scope, const p::DeviceContext& ctx) { + std::string rank_table_file = getenv("RANK_TABLE_FILE"); + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + int src_rank = atoi(getenv("SRC_RANK")); + int dest_rank = atoi(getenv("DEST_RANK")); + VLOG(3) << "rank_id " << rank_id << "src_rank" << src_rank << "dest_rank" + << dest_rank; + + std::vector rank_ids = {0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["nranks"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = + f::OpRegistry::CreateOp("c_comm_init_hcom", {}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); } -void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx){ - std::cout<< "BEGIN TEST:"<< __FUNCTION__ <Var("X"); - auto tensor_x = x->GetMutable(); - int num = atoi(getenv("DATA_SIZE"));; - EXPECT_GT(num, 0); - EXPECT_LT(num, 1 << 15); - std::vector init(num*num, 1.0 * atoi(getenv("DEST_RANK"))); - int rank_id = atoi(getenv("RANK_ID")); - VLOG(3)<<"rank id:"<Resize({num, num}); - ctx.Wait(); - auto place = ctx.GetPlace(); - ctx.Wait(); - - f::AttributeMap attrs; - attrs["tag"]=std::string("srtest"); - attrs["peer"]=atoi(getenv("DEST_RANK")); - attrs["ring_id"]=0; - attrs["srTag"]=0; - - auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs); - - for (int i = 0; i < 10; i ++) { - op->Run(*scope, place); - } - VLOG(3)<<"send run over"; - ctx.Wait(); +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + 
tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"X"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); } -TEST(send_v2, NPU){ - f::Scope scope; - char * npu_id=getenv("FLAGS_selected_npus"); - VLOG(3) << "Select npu:" << npu_id; - p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); - VLOG(3) << "Place over"; - Prepare(&scope, ctx); - VLOG(3) << "Prepare over"; - TestHcomSendOp(&scope, ctx); - VLOG(3) << "Test over"; - +TEST(send_v2, NPU) { + f::Scope scope; + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(atoi(npu_id))); + VLOG(3) << "Place over"; + Prepare(&scope, *ctx); + VLOG(3) << "Prepare over"; + TestHcomSendOp(&scope, *ctx); + VLOG(3) << "Test over"; } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index e6b5e5f8b7860d29f332e8c691c5cf3400baa42f..f06dbd26873a606ce3a834efa9d1bb0de5814ff7 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -38,7 +38,7 @@ USE_OP(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template -void Compare(f::Scope* scope, const p::DeviceContext& ctx, +void Compare(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto x = scope->Var("X"); @@ -90,7 +90,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, } template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, +void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto dout = scope->Var("DOut"); @@ -154,30 +154,30 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(elementwise_add, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_add"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_add"); } TEST(elementwise_sub, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_sub_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "elementwise_sub_grad"); } TEST(elementwise_add_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_add_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "elementwise_add_grad"); } diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 
95f7865a8a3a4ee22600e4a64c7f2e7bf0fa2a2c..880eb341f2093b1a2bae4aea06b416b42e90d30e 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(expand, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index de067e45585d91ce0efa2269909f9a1052a895ac..31e19d8f600c39427ccf83056faed47e192e8ea5 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(gather, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "gather_grad"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "gather_grad"); } diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index d0846e5c90e45fc7149e2013f0fbcea0e0433037..830dcd59839015cf4c32f9caaf0399f209b48a5b 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -59,8 +59,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // run auto place = ctx.GetPlace(); - auto op = f::OpRegistry::CreateOp("gelu", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("gelu", {{"X", {"X"}}}, {{"Out", {"Out"}}}, + attrs); op->Run(*scope, place); ctx.Wait(); @@ -76,8 +76,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); gettimeofday(&end, NULL); - int micros = (((end.tv_sec - start.tv_sec) * 1000000) + - end.tv_usec) - (start.tv_usec); + int micros = + (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); printf("used time: %d\n", micros / 100); // eval value @@ -124,8 +124,8 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { auto place = ctx.GetPlace(); auto op = f::OpRegistry::CreateOp("gelu_grad", - {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, - {{"X@GRAD", {"DX"}}}, attrs); + {{"Out@GRAD", {"DOut"}}, {"X", {"X"}}}, + {{"X@GRAD", {"DX"}}}, attrs); op->Run(*scope, place); ctx.Wait(); @@ -141,8 +141,8 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { ctx.Wait(); gettimeofday(&end, NULL); - int micros = (((end.tv_sec - start.tv_sec) * 1000000) + - end.tv_usec) - (start.tv_usec); + int micros = + (((end.tv_sec - start.tv_sec) * 1000000) + end.tv_usec) - (start.tv_usec); printf("used time: %d\n", micros / 100); // eval value @@ -156,14 +156,13 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { } TEST(gelu, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } 
TEST(gelu_grad, NPU) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } - diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index f4ce9ffe40b0dca72e9634f24dd77944cf2aed68..bde349b0a33b9db6c20427319e09bc696b492a3f 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -54,10 +54,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, auto out = scope->Var("Out"); auto tensor_out = out->GetMutable(); - f::AttributeMap attr_input = { {"step", static_cast(2.0)} }; + f::AttributeMap attr_input = {{"step", static_cast(2.0)}}; auto op = f::OpRegistry::CreateOp("increment", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, - attr_input); + {{"Out", {"Out"}}}, attr_input); op->Run(*scope, place); @@ -70,16 +69,14 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, EXPECT_EQ(out_vec[0], static_cast(3.0)); } - TEST(increment, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } TEST(increment, NPU_fp64) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } - diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index f4ec2fe7158f53af460606403ead460639192617..f2f395314c0cc8ce1af237915ef23c7276570a87 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -67,10 +67,9 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, auto tensor_out = out->GetMutable(); // run - auto op = f::OpRegistry::CreateOp(op_type, {{"Start", {"Start"}}, - {"End", {"End"}}, - {"Step", {"Step"}}}, - {{"Out", {"Out"}}}, {}); + auto op = f::OpRegistry::CreateOp( + op_type, {{"Start", {"Start"}}, {"End", {"End"}}, {"Step", {"Step"}}}, + {{"Out", {"Out"}}}, {}); op->Run(*scope, place); @@ -86,10 +85,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, EXPECT_EQ(static_cast(out_vec[4]), static_cast(9.0)); } - TEST(range, NPU) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "range"); + f::Scope scope; + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "range"); } - diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d408ff3988f030fcc63140deed52a26ba7e8c986..1eeeb5e1f8aa19dd1de149a8e5225fd68c248f34 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(reduce_any, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 89357705ce0e6c8711567ed031b4fb1640bf7441..d20b3ac04bf95cced0c6a3cf0db8a69d8e166ec9 100644 --- 
a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -21,11 +21,10 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" -#include "paddle/fluid/framework/tensor_util.h" - namespace f = paddle::framework; namespace p = paddle::platform; @@ -59,15 +58,13 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // run int axis = 1; f::AttributeMap attrs = { - {"axis", axis}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, }; + {"axis", axis}, {"use_cudnn", false}, + {"use_mkldnn", false}, {"mkldnn_data_type", std::string("float32")}, + {"is_test", false}, + }; - auto op = - f::OpRegistry::CreateOp("softmax", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("softmax", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); ctx.Wait(); @@ -76,7 +73,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TensorToVector(*tensor_out, ctx, &out_vec); for (int i = 0; i < static_cast(out_vec.size()); ++i) { - VLOG(3) << "out_vec[" << i << "] : "<< out_vec[i]; + VLOG(3) << "out_vec[" << i << "] : " << out_vec[i]; } ctx.Wait(); @@ -84,7 +81,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)(6)); } - template void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { // init @@ -128,16 +124,15 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { attrs = { {"name", std::string("softmax_grad")}, {"axis", static_cast(0)}, - {"use_cudnn", false}, - {"use_mkldnn", false}, - {"mkldnn_data_type", std::string("float32")}, - {"is_test", false}, - {"data_format", std::string("AnyLayout")}, }; - auto op = - f::OpRegistry::CreateOp("softmax_grad", - {{"Out", {"Out"}}, - {"Out@GRAD", {"DOut"}}}, - {{"X@GRAD", {"DX"}}}, attrs); + {"use_cudnn", false}, + {"use_mkldnn", false}, + {"mkldnn_data_type", std::string("float32")}, + {"is_test", false}, + {"data_format", std::string("AnyLayout")}, + }; + auto op = f::OpRegistry::CreateOp("softmax_grad", + {{"Out", {"Out"}}, {"Out@GRAD", {"DOut"}}}, + {{"X@GRAD", {"DX"}}}, attrs); auto place = ctx.GetPlace(); op->Run(*scope, place); @@ -164,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(softmax, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(softmax_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 9b0464fc2234bba71645bbe586e30d580eca0c19..1de7ca8c7bdbf44127237afb93d4b4c1bc7f46ab 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -64,9 +64,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { axis.push_back(2); f::AttributeMap attrs = {{"axes", axis}}; - auto op = - 
f::OpRegistry::CreateOp("squeeze", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("squeeze", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); ctx.Wait(); @@ -74,7 +73,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { EXPECT_EQ((uint32_t)tensor_out->dims().size(), uint32_t(2)); EXPECT_EQ((uint32_t)tensor_out->dims()[0], uint32_t(dim0)); EXPECT_EQ((uint32_t)tensor_out->dims()[1], uint32_t(dim1)); - + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); for (uint32_t i = 0; i < out_vec.size(); i++) { @@ -86,7 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(squeeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } - diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 36f7a6953585114e03cc11ff03e1d2da7d8bcd0e..f6712814e1e3b83785eb83b3190d6c8e5fcb14ec 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(transpose2, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(transpose2_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index 6b53cc328a1dea6e6259bf9c83ffb8971a7b263c..a145c914a8621b14bc7f2e461ba1265c8f40f9d8 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -63,9 +63,8 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { axis.push_back(1); f::AttributeMap attrs = {{"axes", axis}}; - auto op = - f::OpRegistry::CreateOp("unsqueeze", {{"X", {"X"}}}, - {{"Out", {"Out"}}}, attrs); + auto op = f::OpRegistry::CreateOp("unsqueeze", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); ctx.Wait(); @@ -86,7 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(unsqueeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } - diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index e28ace521674bcfba0b2725dc4daa57c66126476..71eee4fe1216c247f732c01d6bd4350a96b6cb98 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -254,7 +254,7 @@ NPUDeviceContext::~NPUDeviceContext() { void NPUDeviceContext::Wait() const { platform::RecordEvent record_event("NPUDeviceContext/wait"); - VLOG(4) << "NPU context Wait"; + VLOG(4) << "NPU context(" << this << ") Wait"; stream_->Wait(); }