diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index 701f0a430aa5c24f6113fd43bc5015f40d1f2dce..b856bbec4b0c47f387487a79388013ed91b1fc32 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -137,6 +137,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
   new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
   new_op_func_node.kernel_func_(exec_ctx);
+  // NOTE(winter-wang): on the npu device, the D2H kernel is asynchronous and
+  // needs explicit synchronization.
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (op_type == kMemcpyD2H) {
+    dev_ctx->Wait();
+  }
+#endif
   // NOTE(Aurelius84): data_transform_op is expensive operation, so we tag them
   // as kQueueSync and execute them in thread pool.
   new_op_func_node.type_ = OpFuncType::kQueueSync;
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 3c66eb0c4613cd2e8cf85ca611e3ca5348db91e2..c321069537c8974af6a231a6e46fe3e8f0dc16d9 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -90,6 +90,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
     auto local_scope = &var_scope_.GetMutableScope()->NewScope();
     local_scope_ = local_scope;
   }
+  var_scope_.SetLocalScope(local_scope_);
 
   // prune
 
@@ -115,7 +116,6 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-  var_scope_.SetLocalScope(local_scope_);
   Prepare(feed_names, feed_tensors, true);
   interpreter::CostInfo cost_info;
   {
@@ -144,7 +144,6 @@ paddle::framework::FetchList InterpreterCore::Run(
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
   bool is_build = is_build_;
-  var_scope_.SetLocalScope(local_scope_);
   Prepare(feed_names, feed_tensors, is_build);
 
   if (is_build) {
@@ -153,8 +152,10 @@ paddle::framework::FetchList InterpreterCore::Run(
     // until the second step run.
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
   }
-
   if (create_local_scope_) {
     ClearLoDTensorArrayInLocalScope();
   }
@@ -174,7 +175,6 @@ paddle::framework::FetchList InterpreterCore::Run(
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
   if (!is_build_) {
-    var_scope_.SetLocalScope(local_scope_);
    paddle::framework::interpreter::build_variable_scope(block_, &var_scope_);
 
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
@@ -196,12 +196,14 @@ paddle::framework::FetchList InterpreterCore::Run(
 
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
   }
 
   if (create_local_scope_) {
     ClearLoDTensorArrayInLocalScope();
   }
-
   // return Fetch Tensors
   auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
   if (fetch_var) {
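The Wait() calls added above all address the same hazard: on NPU the D2H memcpy kernel returns before the copy has landed in host memory, so the host must block explicitly before reading the result or finishing a step. A standalone sketch of the hazard, with std::async standing in for the device stream (illustrative names, not the Paddle API):

    #include <cassert>
    #include <future>
    #include <vector>

    // Toy stand-in for an asynchronous D2H copy: a "device stream" fills the
    // host buffer on another thread and completes at some later point.
    std::future<void> AsyncD2HCopy(const std::vector<float>& device_src,
                                   std::vector<float>* host_dst) {
      return std::async(std::launch::async,
                        [&device_src, host_dst] { *host_dst = device_src; });
    }

    int main() {
      std::vector<float> device_src(1024, 1.0f);
      std::vector<float> host_dst;
      auto done = AsyncD2HCopy(device_src, &host_dst);
      // Reading host_dst here would race with the in-flight copy; the
      // interpreter's dev_ctx->Wait() plays the role of this explicit wait.
      done.wait();
      assert(host_dst.size() == 1024);  // safe only after the wait
      return 0;
    }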
@@ -528,6 +530,17 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
   Scope* local_scope = create_local_scope_ ?
       var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope();
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+  // values directly; only the special `float_status` variable records whether
+  // an operation has overflowed. More about `float_status`, see:
+  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+  if (FLAGS_check_nan_inf) {
+    framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+  }
+#endif
+
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     // If it is OperatorBase, InferShape do nothing.
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 1a539c1ce1cea0350e9a895e4d25a606e081ee4a..acbcf1da4c5e3e4fddf1e5aad074f3e4d2ca8fdf 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -15,6 +15,7 @@
 
 #include <algorithm>
 
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/new_executor/data_transfer.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
@@ -43,6 +44,7 @@ PADDLE_DEFINE_EXPORTED_bool(
     "Enable serial execution for standalone executor, used for debug.");
 
 DECLARE_bool(use_mkldnn);
+DECLARE_bool(check_nan_inf);
 
 namespace paddle {
 namespace framework {
@@ -446,11 +448,19 @@ void build_op_func_list(const platform::Place& place,
     op_func_node.output_index = outs_name2id;
     VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
+#ifdef PADDLE_WITH_ASCEND_CL
+    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+    // values directly; only the special `float_status` variable records
+    // whether an operation has overflowed. More about `float_status`, see:
+    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+    if (FLAGS_check_nan_inf) {
+      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+    }
+#endif
+
     if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
       // op is not a operatorwithkernel, so direcly run OperatorBase::Run()
       deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
-      VLOG(4) << "End run " << place << " "
-              << op_func_node.operator_base_->DebugStringEx(local_scope);
     } else {
       auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
           static_cast<const framework::OperatorWithKernel*>(op));
@@ -593,6 +603,12 @@ void build_op_func_list(const platform::Place& place,
                 << var_scope->GetNameById(p.second);
       }
     }
+
+    // for debug nan/inf
+    if (FLAGS_check_nan_inf) {
+      VLOG(4) << "Check nan/inf";
+      framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place);
+    }
   }
 
   VLOG(4) << "End run " << place << " "
@@ -768,12 +784,7 @@ void ShrinkDownstreamMap(std::map<int, std::list<int>>* downstream_map,
   // b: c
 
   // happens_before[i][j] means i should be executed before j
-  op_happens_before->resize(op_num);
-  for (size_t i = 0; i < op_num; ++i) {
-    (*op_happens_before)[i].resize(op_num);
-    std::fill(
-        (*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), false);
-  }
+  op_happens_before->assign(op_num, std::vector<bool>(op_num, false));
 
   // bfs to get all next ops
   auto bfs = [&](size_t op_idx) {
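The ShrinkDownstreamMap change above replaces a resize-plus-fill loop with a single assign(). Both forms produce the same op_num x op_num matrix of false, but assign() discards any stale contents unconditionally, which the resize path only guarantees because of the explicit std::fill. A minimal sketch of the equivalence:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      const size_t op_num = 3;
      std::vector<std::vector<bool>> happens_before;

      // Old form: resize the outer vector, then resize and refill every row.
      happens_before.resize(op_num);
      for (size_t i = 0; i < op_num; ++i) {
        happens_before[i].resize(op_num);
        std::fill(happens_before[i].begin(), happens_before[i].end(), false);
      }

      // New form: one call that replaces any previous contents with an
      // op_num x op_num matrix of false, regardless of the prior size.
      happens_before.assign(op_num, std::vector<bool>(op_num, false));

      assert(happens_before.size() == op_num);
      assert(happens_before[0].size() == op_num);
      return 0;
    }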
@@ -883,6 +894,18 @@ std::map<int, std::list<int>> build_op_downstream_map(
         }
       }
     }
+    // the original output of an inplace op is also changed
+    if (!vec_instruction[op_idx].InplaceBackMap().empty()) {
+      auto& m = vec_instruction[op_idx].InplaceBackMap();
+      for (auto& p : m) {
+        auto& var = p.second;
+        if (var2min_rw_op.count(var)) {
+          for (auto dep_op : var2min_rw_op[var]) {
+            op2dependences[op_idx].insert(dep_op);
+          }
+        }
+      }
+    }
 
     // step2: update 2 var2xxxx data structure
     for (auto& item :
@@ -894,16 +917,6 @@ std::map<int, std::list<int>> build_op_downstream_map(
       }
     }
 
-    for (auto& item :
-         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
-      for (auto var : item.second) {
-        if (remove_duplicate.count(var) ==
-            0) {  // var in input list and in output list, so remove it.
-          update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
-        }
-      }
-    }
-
     // NOTE(zhiqiu): The inplace op with `transfer` also changes
     // original output after that so add original output as well
     // original: a->op->a
@@ -914,8 +927,16 @@ std::map<int, std::list<int>> build_op_downstream_map(
       for (auto& p : m) {
         auto var = p.second;
         var2recent_write_op[var] = op_idx;
-        // var in input list and in output list, so remove it.
-        if (remove_duplicate.count(var) == 0) {
+        var2min_rw_op[var] = {static_cast<int>(op_idx)};
+        remove_duplicate.insert(var);
+      }
+    }
+
+    for (auto& item :
+         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
+      for (auto var : item.second) {
+        if (remove_duplicate.count(var) ==
+            0) {  // var in input list and in output list, so remove it.
           update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
         }
       }
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index 70a92f0ae28aed3240fbeee62c5fdc7133dcbcb3..af3951f4538f12f035fdc0e5944c75ff33fb63f8 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -389,7 +389,8 @@ static bool IsCpuOp(const Instruction& instr) {
 
 // is supported heterogeneous place
 static bool IsSupportedHetePlace(const phi::Place& place) {
-  return platform::is_gpu_place(place) || platform::is_xpu_place(place);
+  return platform::is_gpu_place(place) || platform::is_npu_place(place) ||
+         platform::is_xpu_place(place);
 }
 
 }  // namespace interpreter
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc
index b7a7e4c0b546ff76b813e6c2465e57bed6bca632..086dac8dac1fbf2ce82cc31089ceb57933b4415e 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc
@@ -21,23 +21,37 @@
 namespace paddle {
 namespace framework {
+namespace {
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    d2h_ctxs = nullptr;
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    h2d_ctxs = nullptr;
+std::mutex ctx_mtx;
+}  // namespace
 
 StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) {
-  if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    platform::EmplaceDeviceContexts(
-        &d2h_ctxs_,
-        {place},
-        /*disable_setting_default_stream_for_allocator=*/true);
-    platform::EmplaceDeviceContexts(
-        &h2d_ctxs_,
-        {place},
-        /*disable_setting_default_stream_for_allocator=*/true);
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("CUDAPlace is not supported. Please "
-                                        "re-compile with WITH_GPU option."));
-#endif
+  if (platform::is_gpu_place(place) || platform::is_npu_place(place)) {
+    std::lock_guard<std::mutex> lk(ctx_mtx);
+    if (d2h_ctxs == nullptr) {
+      d2h_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+      h2d_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+    }
+    if (d2h_ctxs->find(place) == d2h_ctxs->end()) {
+      platform::EmplaceDeviceContexts(
+          d2h_ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true);
+      platform::EmplaceDeviceContexts(
+          h2d_ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true);
+    }
+    d2h_ctx_ = (*d2h_ctxs)[place];
+    h2d_ctx_ = (*h2d_ctxs)[place];
   }
 }
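The constructor now keeps one d2h/h2d context pair per place in process-global maps, so every StreamAnalyzer created for the same place shares the same pair of copy streams instead of owning a private map. The real code stores std::shared_future<std::unique_ptr<DeviceContext>> because EmplaceDeviceContexts constructs contexts asynchronously; the sketch below simplifies that to a shared_ptr and keeps only the lazily initialized, mutex-guarded cache shape (generic types, not the Paddle API):

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    // Generic stand-in for platform::DeviceContext keyed by Place.
    struct Context { std::string place; };

    std::shared_ptr<Context> GetCachedContext(const std::string& place) {
      static std::map<std::string, std::shared_ptr<Context>> cache;
      static std::mutex mtx;
      std::lock_guard<std::mutex> lk(mtx);
      auto it = cache.find(place);
      if (it == cache.end()) {  // first analyzer for this place creates it
        it = cache.emplace(place, std::make_shared<Context>(Context{place}))
                 .first;
      }
      return it->second;  // later analyzers share the same context/stream
    }

    int main() {
      auto a = GetCachedContext("npu:0");
      auto b = GetCachedContext("npu:0");
      return a == b ? 0 : 1;  // same object: one copy stream per place
    }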
Please " - "re-compile with WITH_GPU option.")); -#endif + if (platform::is_gpu_place(place) || platform::is_npu_place(place)) { + std::lock_guard lk(ctx_mtx); + if (d2h_ctxs == nullptr) { + d2h_ctxs = new std::map< + Place, + std::shared_future>>(); + h2d_ctxs = new std::map< + Place, + std::shared_future>>(); + } + if (d2h_ctxs->find(place) == d2h_ctxs->end()) { + platform::EmplaceDeviceContexts( + d2h_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + platform::EmplaceDeviceContexts( + h2d_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + } + d2h_ctx_ = (*d2h_ctxs)[place]; + h2d_ctx_ = (*h2d_ctxs)[place]; } } @@ -162,15 +176,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( const OpFuncNode& op_func_node) { auto& op_type = op_func_node.operator_base_->Type(); auto* dev_ctx = op_func_node.dev_ctx_; - // only gpu need update. xpu not need, because xpu memcpy op kernel is + // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. - if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) { if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; - dev_ctx = d2h_ctxs_[place_].get().get(); + dev_ctx = d2h_ctx_.get().get(); } else if (op_type == interpreter::kMemcpyH2D) { VLOG(3) << "Get dev_ctx from h2d_context_pool_"; - dev_ctx = h2d_ctxs_[place_].get().get(); + dev_ctx = h2d_ctx_.get().get(); } } return dev_ctx; @@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( */ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { - return platform::is_xpu_place(place_) || - (&cur_instr.DeviceContext() == &next_instr.DeviceContext() || - interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyD2H(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)); + if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; + + // xpu memcpy kerenl is synchronous. + if (platform::is_xpu_place(place_)) return true; + + // npu d2h kernel is asynchronous. 
@@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  return platform::is_xpu_place(place_) ||
-         (&cur_instr.DeviceContext() == &next_instr.DeviceContext() ||
-          interpreter::IsCpuOp(cur_instr) ||
-          interpreter::IsMemcpyD2H(cur_instr) ||
-          interpreter::IsMemcpyH2D(next_instr));
+  if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true;
+
+  // xpu memcpy kernel is synchronous.
+  if (platform::is_xpu_place(place_)) return true;
+
+  // npu d2h kernel is asynchronous.
+  if (platform::is_npu_place(place_)) {
+    return interpreter::IsCpuOp(cur_instr) ||
+           interpreter::IsMemcpyH2D(next_instr);
+  }
+  // gpu or cpu
+  return interpreter::IsCpuOp(cur_instr) ||
+         interpreter::IsMemcpyD2H(cur_instr) ||
+         interpreter::IsMemcpyH2D(next_instr);
 }
 
 platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
@@ -201,6 +224,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
   } else {
     if (platform::is_xpu_place(place_)) {
       return platform::kXPU;
+    } else if (platform::is_npu_place(place_)) {
+      return platform::kNPU;
     }
     return platform::kCUDA;
   }
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h
index 61e37bbb686fcd3f111680d0ed77b41ad12ee8cd..4be8ffe6bb4caeb91d91214aad630f7b1abfee6d 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.h
@@ -53,9 +53,9 @@ class StreamAnalyzer {
 
   platform::DeviceType GetWaiterType(const Instruction& instr);
 
-  Place place_;
-  std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>> d2h_ctxs_;
-  std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>> h2d_ctxs_;
+  const Place place_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> d2h_ctx_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> h2d_ctx_;
 
   std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;
 };
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4364934a4027d9022e5939fc8db0a52ce7d3d5d8..917cebc11f9a904191591f7eac60cfce0ea531d7 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1080,11 +1080,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
   } else {
     return m->GetAllocator(p, size)->Allocate(size);
   }
-#elif defined PADDLE_WITH_XPU
+#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
   return GetAllocator(place)->Allocate(size);
 #else
-  PADDLE_THROW(
-      platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "Not compiled with GPU or XPU or NPU."));
 #endif
 }
 
diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc
index 6c4c6eb25d8204d9429a1da0617458e5cd9481ab..bd50dea15f80e9b58321b825cf936f778f6a1a43 100644
--- a/paddle/fluid/operators/crop_op_npu.cc
+++ b/paddle/fluid/operators/crop_op_npu.cc
@@ -70,8 +70,12 @@ class CropNPUKernel : public framework::OpKernel<T> {
                           shape->dims().size(),
                           x->dims().size()));
 
+    // shape memory may be freed by gc before the async kernel runs.
+    Tensor tmp_shape(*shape);
+    tmp_shape.mutable_data<T>(ctx.GetPlace());
+
     const auto& runner =
-        NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input);
+        NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
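The crop_op_npu.cc fix above relies on Tensor's shallow copy semantics: tmp_shape shares the allocation behind *shape, so the buffer stays alive for the asynchronously launched "Crop" kernel even if the scope GC drops the original variable first. The same idea with a shared_ptr standing in for the shared allocation (a sketch, not the Tensor API):

    #include <memory>
    #include <thread>
    #include <vector>

    // shared_ptr stands in for a Tensor's shared allocation: the async
    // "kernel" holds its own reference, so releasing the caller's handle
    // cannot free the buffer while the kernel is still in flight.
    int main() {
      auto shape = std::make_shared<std::vector<float>>(4, 1.0f);
      std::thread kernel([held = shape] {  // the copy pins the buffer
        float sum = 0;
        for (float v : *held) sum += v;
        (void)sum;
      });
      shape.reset();  // original handle "gc-ed"; the allocation survives
      kernel.join();  // kernel finishes safely, then the buffer is freed
      return 0;
    }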
"); + AddAttr("dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it support:" + "0. CUDAPinnedPlace/CPU <->CUDAPlace" + "1. NPUPinnedPlace/CPU <-> NPUPlace" + "2. CPU <->XPUPlace" + "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace. diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index efe0479871215a593306f1edb2b5f2d987ffd74d..b00e4056259d93face5ab11304388c18d4956fe8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -280,6 +280,16 @@ if(WITH_XPU) CACHE INTERNAL "device event libs") endif() +if(WITH_ASCEND_CL) + cc_library( + device_event_npu + SRCS device_event_npu.cc + DEPS device_event_base npu_resource_pool) + set(DEVICE_EVENT_LIBS + device_event_npu + CACHE INTERNAL "device event libs") +endif() + if(WITH_GPU) nv_library( device_event_gpu diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index 362c4e8fae8b1368245bfe6f95d7e0c1adc44e2c..9acdef985ade20004d88ed9a1ea2d6b25527592d 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -285,6 +285,10 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status)); } +void NPUEventSynchronize(aclrtEvent event) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event)); +} + void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event)); } diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h index f7af1c246ef6c2afc1c1caf0013796cac6ea3089..ea55831db2e225ae2b1accf8ce589deff47f1e8d 100644 --- a/paddle/fluid/platform/device/npu/npu_info.h +++ b/paddle/fluid/platform/device/npu/npu_info.h @@ -138,6 +138,9 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status); //! Record NPU event in the stream. void NPUEventRecord(aclrtEvent event, aclrtStream stream); +//! Synchronize NPU event. +void NPUEventSynchronize(aclrtEvent event); + //! Makes a stream wait on an event. 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a668d7f4b8366d8240f3974275e3afe28bc3f242..6bceb696c0f8e18c36472be34eb8efb2a471cd85 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -125,6 +125,8 @@ DeviceType Place2DeviceType(const platform::Place& place) {
     return platform::DeviceType::XPU;
   } else if (platform::is_ipu_place(place)) {
     return platform::DeviceType::IPU;
+  } else if (platform::is_npu_place(place)) {
+    return platform::DeviceType::NPU;
   } else if (platform::is_mlu_place(place)) {
     return platform::DeviceType::MLU;
   } else {
diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h
index 1fd116600624ca58c469f9460644ebef76a94923..2edccfa90c9395236c74ce0292664a3d581e2ba1 100644
--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -25,6 +25,7 @@
 
 using ::paddle::platform::kCPU;
 using ::paddle::platform::kCUDA;
+using ::paddle::platform::kNPU;
 using ::paddle::platform::kXPU;
 
 USE_EVENT(kCPU)
@@ -41,3 +42,9 @@ USE_EVENT(kXPU);
 USE_EVENT_WAIT(kXPU, kXPU)
 USE_EVENT_WAIT(kCPU, kXPU)
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+USE_EVENT(kNPU);
+USE_EVENT_WAIT(kNPU, kNPU)
+USE_EVENT_WAIT(kCPU, kNPU)
+#endif
diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h
index b42721a60d974a9f353adc0c4b56c817f24f8fbf..51df0fd4f40adc42780333cc7fd90bb39634ac6e 100644
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -66,7 +66,7 @@ class DeviceEvent {
                           type_id_));
     // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later
     PADDLE_ENFORCE_LT(type_id_,
-                      3,
+                      4,
                       platform::errors::Unavailable(
                           "Currently DeviceEvent do not support %s", place));
     PADDLE_ENFORCE_NOT_NULL(
", + device_id_)); + inner_event_ = NpuEventResourcePool::Instance().New(device_id_); + } + std::shared_ptr inner_event_; + int device_id_; +}; + +void DeviceEventCreateNPU(DeviceEvent* event, + const platform::Place& place, + unsigned int) { + event->InitEvent(std::make_shared(place)); +} + +void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); +} + +bool DeviceEventQueryNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(wrapper->inner_event_.get(), &status); + return ACL_EVENT_STATUS_COMPLETE == status; +} + +void DeviceEventFinishNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + NPUEventSynchronize(wrapper->inner_event_.get()); +} + +void DeviceEventNPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); +} + +void DeviceEventCPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + DeviceEventFinishNPU(event); +} + +void DeviceEventSetFinishedNPU(const DeviceEvent* event) { + // do nothing +} + +void EventResetNPU(const DeviceEvent* event) { + // do nothing +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCPU; +using ::paddle::platform::kNPU; +REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) +REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) +REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) +REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) +REGISTER_EVENT_SET_FINISHED_FUNCTION( + kNPU, paddle::platform::DeviceEventSetFinishedNPU) +REGISTER_EVENT_WAIT_FUNCTION(kNPU, + kNPU, + paddle::platform::DeviceEventNPUWaitNPU) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, + kNPU, + paddle::platform::DeviceEventCPUWaitNPU) +REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU) +#endif diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 3303b6c9472ff9bcac0ab8da822c1cdfcd0635b5..5f80e3b7577707c68e4439db309edc835afade4f 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1400,9 +1400,8 @@ class Executor(object): program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_npu() or core.is_compiled_with_mlu( - ) or core.is_compiled_with_ipu() or isinstance( - place, core.CustomPlace): + if core.is_compiled_with_mlu() or core.is_compiled_with_ipu( + ) or isinstance(place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram)