[NPU] add npu support for new executor. test=develop (#43403)

5988553f · 王明冬 · GitHub · 0a04b8a9 · 5988553f · 5988553f
17 changed file
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -137,6 +137,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
  new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
  new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
  new_op_func_node.kernel_func_(exec_ctx);
+  // NOTE(winter-wang): on NPU devices the D2H kernel is asynchronous, so an
+  // explicit synchronization is needed here.
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (op_type == kMemcpyD2H) {
+    dev_ctx->Wait();
+  }
+#endif
  // NOTE(Aurelius84): data_transform_op is expensive operation, so we tag them
  // as kQueueSync and execute them in thread pool.
  new_op_func_node.type_ = OpFuncType::kQueueSync;

--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -90,6 +90,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
+  var_scope_.SetLocalScope(local_scope_);

  // prune

@@ -115,7 +116,6 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
-  var_scope_.SetLocalScope(local_scope_);
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
@@ -144,7 +144,6 @@ paddle::framework::FetchList InterpreterCore::Run(
  platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
  bool is_build = is_build_;
-  var_scope_.SetLocalScope(local_scope_);
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
@@ -153,8 +152,10 @@ paddle::framework::FetchList InterpreterCore::Run(
    // until the second step run.
    async_work_queue_ = GetWorkQueue();
    ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
  }
-
  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
@@ -174,7 +175,6 @@ paddle::framework::FetchList InterpreterCore::Run(
  platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
  if (!is_build_) {
-    var_scope_.SetLocalScope(local_scope_);
    paddle::framework::interpreter::build_variable_scope(block_, &var_scope_);

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
@@ -196,12 +196,14 @@ paddle::framework::FetchList InterpreterCore::Run(
    async_work_queue_ = GetWorkQueue();

    ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
-
  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
@@ -528,6 +530,17 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+  // values, but only through the special `float_status`, which tells whether
+  // the operation overflowed. More about `float_status`, see:
+  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+  if (FLAGS_check_nan_inf) {
+    framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+  }
+#endif
+
  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape do nothing.

--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -15,6 +15,7 @@

 #include <algorithm>

+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/new_executor/data_transfer.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
@@ -43,6 +44,7 @@ PADDLE_DEFINE_EXPORTED_bool(
    "Enable serial execution for standalone executor, used for debug.");

 DECLARE_bool(use_mkldnn);
+DECLARE_bool(check_nan_inf);

 namespace paddle {
 namespace framework {
@@ -446,11 +448,19 @@ void build_op_func_list(const platform::Place& place,
    op_func_node.output_index = outs_name2id;
    VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);

+#ifdef PADDLE_WITH_ASCEND_CL
+    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+    // values, but only through the special `float_status`, which tells whether
+    // the operation overflowed. More about `float_status`, see:
+    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+    if (FLAGS_check_nan_inf) {
+      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+    }
+#endif
+
    if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
      // op is not a operatorwithkernel, so direcly run OperatorBase::Run()
      deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
-      VLOG(4) << "End run " << place << " "
-              << op_func_node.operator_base_->DebugStringEx(local_scope);
    } else {
      auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
          static_cast<const framework::OperatorWithKernel*>(op));
@@ -593,6 +603,12 @@ void build_op_func_list(const platform::Place& place,
                  << var_scope->GetNameById(p.second);
        }
      }
+
+      // for debug nan/inf
+      if (FLAGS_check_nan_inf) {
+        VLOG(4) << "Check nan/inf";
+        framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place);
+      }
    }

    VLOG(4) << "End run " << place << " "
@@ -768,12 +784,7 @@ void ShrinkDownstreamMap(std::map<int, std::list<int>>* downstream_map,
  // b: c

  // happens_before[i][j] means i should be executed before j
-  op_happens_before->resize(op_num);
-  for (size_t i = 0; i < op_num; ++i) {
-    (*op_happens_before)[i].resize(op_num);
-    std::fill(
-        (*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), false);
-  }
+  op_happens_before->assign(op_num, std::vector<bool>(op_num, false));

  // bfs to get all next ops
  auto bfs = [&](size_t op_idx) {
@@ -883,6 +894,18 @@ std::map<int, std::list<int>> build_op_downstream_map(
        }
      }
    }
+    // The original output of an inplace op is also changed.
+    if (!vec_instruction[op_idx].InplaceBackMap().empty()) {
+      auto& m = vec_instruction[op_idx].InplaceBackMap();
+      for (auto& p : m) {
+        auto& var = p.second;
+        if (var2min_rw_op.count(var)) {
+          for (auto dep_op : var2min_rw_op[var]) {
+            op2dependences[op_idx].insert(dep_op);
+          }
+        }
+      }
+    }

    // step2: update 2 var2xxxx data structure
    for (auto& item :
@@ -894,16 +917,6 @@ std::map<int, std::list<int>> build_op_downstream_map(
      }
    }

-    for (auto& item :
-         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
-      for (auto var : item.second) {
-        if (remove_duplicate.count(var) ==
-            0) {  // var in input list and in output list, so remove it.
-          update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
-        }
-      }
-    }
-
    // NOTE(zhiqiu): The inplace op with `transfer` also changes
    // original output after that so add original output as well
    // original: a->op->a
@@ -914,8 +927,16 @@ std::map<int, std::list<int>> build_op_downstream_map(
      for (auto& p : m) {
        auto var = p.second;
        var2recent_write_op[var] = op_idx;
-        // var in input list and in output list, so remove it.
-        if (remove_duplicate.count(var) == 0) {
+        var2min_rw_op[var] = {static_cast<int>(op_idx)};
+        remove_duplicate.insert(var);
+      }
+    }
+
+    for (auto& item :
+         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
+      for (auto var : item.second) {
+        if (remove_duplicate.count(var) ==
+            0) {  // var in input list and in output list, so remove it.
          update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
        }
      }

--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -389,7 +389,8 @@ static bool IsCpuOp(const Instruction& instr) {

 // is supported heterogeneous place
 static bool IsSupportedHetePlace(const phi::Place& place) {
-  return platform::is_gpu_place(place) || platform::is_xpu_place(place);
+  return platform::is_gpu_place(place) || platform::is_npu_place(place) ||
+         platform::is_xpu_place(place);
 }

 }  // namespace interpreter

--- a/paddle/fluid/framework/new_executor/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc
@@ -21,23 +21,37 @@

 namespace paddle {
 namespace framework {
+namespace {
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    d2h_ctxs = nullptr;
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    h2d_ctxs = nullptr;
+std::mutex ctx_mtx;
+}  // namespace

 StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) {
-  if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  if (platform::is_gpu_place(place) || platform::is_npu_place(place)) {
+    std::lock_guard<std::mutex> lk(ctx_mtx);
+    if (d2h_ctxs == nullptr) {
+      d2h_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+      h2d_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+    }
+    if (d2h_ctxs->find(place) == d2h_ctxs->end()) {
      platform::EmplaceDeviceContexts(
-        &d2h_ctxs_,
+          d2h_ctxs,
          {place},
          /*disable_setting_default_stream_for_allocator=*/true);
      platform::EmplaceDeviceContexts(
-        &h2d_ctxs_,
+          h2d_ctxs,
          {place},
          /*disable_setting_default_stream_for_allocator=*/true);
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("CUDAPlace is not supported. Please "
-                                        "re-compile with WITH_GPU option."));
-#endif
+    }
+    d2h_ctx_ = (*d2h_ctxs)[place];
+    h2d_ctx_ = (*h2d_ctxs)[place];
  }
 }

@@ -162,15 +176,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
    const OpFuncNode& op_func_node) {
  auto& op_type = op_func_node.operator_base_->Type();
  auto* dev_ctx = op_func_node.dev_ctx_;
-  // only gpu need update. xpu not need, because xpu memcpy op kernel is
+  // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is
  // synchronous.
-  if (platform::is_gpu_place(place_)) {
+  if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) {
    if (op_type == interpreter::kMemcpyD2H) {
      VLOG(3) << "Get dev_ctx from d2h_context_pool_";
-      dev_ctx = d2h_ctxs_[place_].get().get();
+      dev_ctx = d2h_ctx_.get().get();
    } else if (op_type == interpreter::kMemcpyH2D) {
      VLOG(3) << "Get dev_ctx from h2d_context_pool_";
-      dev_ctx = h2d_ctxs_[place_].get().get();
+      dev_ctx = h2d_ctx_.get().get();
    }
  }
  return dev_ctx;
@@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
 */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                 const Instruction& next_instr) {
-  return platform::is_xpu_place(place_) ||
-         (&cur_instr.DeviceContext() == &next_instr.DeviceContext() ||
-          interpreter::IsCpuOp(cur_instr) ||
+  if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true;
+
+  // xpu memcpy kernel is synchronous.
+  if (platform::is_xpu_place(place_)) return true;
+
+  // npu d2h kernel is asynchronous.
+  if (platform::is_npu_place(place_)) {
+    return interpreter::IsCpuOp(cur_instr) ||
+           interpreter::IsMemcpyH2D(next_instr);
+  }
+  // gpu or cpu
+  return interpreter::IsCpuOp(cur_instr) ||
         interpreter::IsMemcpyD2H(cur_instr) ||
-          interpreter::IsMemcpyH2D(next_instr));
+         interpreter::IsMemcpyH2D(next_instr);
 }

 platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
@@ -201,6 +224,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
  } else {
    if (platform::is_xpu_place(place_)) {
      return platform::kXPU;
+    } else if (platform::is_npu_place(place_)) {
+      return platform::kNPU;
    }
    return platform::kCUDA;
  }

--- a/paddle/fluid/framework/new_executor/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.h
@@ -53,9 +53,9 @@ class StreamAnalyzer {

  platform::DeviceType GetWaiterType(const Instruction& instr);

-  Place place_;
-  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>> d2h_ctxs_;
-  std::map<Place, std::shared_future<std::unique_ptr<DeviceContext>>> h2d_ctxs_;
+  const Place place_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> d2h_ctx_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> h2d_ctx_;
  std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;
 };


--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1080,11 +1080,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
  } else {
    return m->GetAllocator(p, size)->Allocate(size);
  }
-#elif defined PADDLE_WITH_XPU
+#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
  return GetAllocator(place)->Allocate(size);
 #else
-  PADDLE_THROW(
-      platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "Not compiled with GPU or XPU or NPU."));
 #endif
 }


--- a/paddle/fluid/operators/crop_op_npu.cc
+++ b/paddle/fluid/operators/crop_op_npu.cc
@@ -70,8 +70,12 @@ class CropNPUKernel : public framework::OpKernel<T> {
                            shape->dims().size(),
                            x->dims().size()));

+      // The memory of `shape` may already have been released by gc.
+      Tensor tmp_shape(*shape);
+      tmp_shape.mutable_data<T>(ctx.GetPlace());
+
      const auto& runner =
-          NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input);
+          NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
      auto stream =
          ctx.template device_context<paddle::platform::NPUDeviceContext>()
              .stream();

--- a/paddle/fluid/operators/memcpy_h2d_op.cc
+++ b/paddle/fluid/operators/memcpy_h2d_op.cc
@@ -94,14 +94,13 @@ class MemcpyH2DOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out",
              "(LoDTensor) The type of output "
              "is the same as input X.");
-    AddAttr<int>(
-        "dst_place_type",
+    AddAttr<int>("dst_place_type",
                 "Determine the dst place of tensor copy. "
-        "By Now it ONLY support CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace "
-        "Other place type is Unimplemented and will cause ERROR."
-        "0: dst is on CUDAPlace. "
-        "1: dst is on NPUPlace. "
-        "2: dst is on XPUPlace. ");
+                 "By Now it support:"
+                 "0. CUDAPinnedPlace/CPU <->CUDAPlace"
+                 "1. NPUPinnedPlace/CPU <-> NPUPlace"
+                 "2. CPU <->XPUPlace"
+                 "Other place type is Unimplemented and will cause ERROR.");
    AddComment(R"DOC(
    MemcpyD2H Operator.
    By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace.

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -280,6 +280,16 @@ if(WITH_XPU)
      CACHE INTERNAL "device event libs")
 endif()

+if(WITH_ASCEND_CL)
+  cc_library(
+    device_event_npu
+    SRCS device_event_npu.cc
+    DEPS device_event_base npu_resource_pool)
+  set(DEVICE_EVENT_LIBS
+      device_event_npu
+      CACHE INTERNAL "device event libs")
+endif()
+
 if(WITH_GPU)
  nv_library(
    device_event_gpu

--- a/paddle/fluid/platform/device/npu/npu_info.cc
+++ b/paddle/fluid/platform/device/npu/npu_info.cc
@@ -285,6 +285,10 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status));
 }

+void NPUEventSynchronize(aclrtEvent event) {
+  PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event));
+}
+
 void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) {
  PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event));
 }

--- a/paddle/fluid/platform/device/npu/npu_info.h
+++ b/paddle/fluid/platform/device/npu/npu_info.h
@@ -138,6 +138,9 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status);
 //! Record NPU event in the stream.
 void NPUEventRecord(aclrtEvent event, aclrtStream stream);

+//! Synchronize NPU event.
+void NPUEventSynchronize(aclrtEvent event);
+
 //! Makes a stream wait on an event.
 void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event);


--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -125,6 +125,8 @@ DeviceType Place2DeviceType(const platform::Place& place) {
    return platform::DeviceType::XPU;
  } else if (platform::is_ipu_place(place)) {
    return platform::DeviceType::IPU;
+  } else if (platform::is_npu_place(place)) {
+    return platform::DeviceType::NPU;
  } else if (platform::is_mlu_place(place)) {
    return platform::DeviceType::MLU;
  } else {

--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -25,6 +25,7 @@

 using ::paddle::platform::kCPU;
 using ::paddle::platform::kCUDA;
+using ::paddle::platform::kNPU;
 using ::paddle::platform::kXPU;

 USE_EVENT(kCPU)
@@ -41,3 +42,9 @@ USE_EVENT(kXPU);
 USE_EVENT_WAIT(kXPU, kXPU)
 USE_EVENT_WAIT(kCPU, kXPU)
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+USE_EVENT(kNPU);
+USE_EVENT_WAIT(kNPU, kNPU)
+USE_EVENT_WAIT(kCPU, kNPU)
+#endif
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -66,7 +66,7 @@ class DeviceEvent {
                          type_id_));
    // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later
    PADDLE_ENFORCE_LT(type_id_,
-                      3,
+                      4,
                      platform::errors::Unavailable(
                          "Currently DeviceEvent do not support %s", place));
    PADDLE_ENFORCE_NOT_NULL(

--- a/paddle/fluid/platform/device_event_npu.cc
+++ b/paddle/fluid/platform/device_event_npu.cc
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_ASCEND_CL
+
+#include "paddle/fluid/platform/device/npu/npu_resource_pool.h"
+#include "paddle/fluid/platform/device_event_base.h"
+#include "paddle/fluid/platform/event.h"
+namespace paddle {
+namespace platform {
+// Holds the NPU event object that backs a DeviceEvent created for an NPU
+// place, together with the device index it was created for.
+struct NPUDeviceEventWrapper {
+  // Checks that `place` is an NPU place with a device index greater than -1,
+  // then requests an event for that device from NpuEventResourcePool.
+  // Either failed check raises PreconditionNotMet.
+  explicit NPUDeviceEventWrapper(const platform::Place& place) {
+    // `place` is a Place object, not an integer, so it is formatted with %s
+    // (the same specifier DeviceEvent uses when it reports a place).
+    PADDLE_ENFORCE_EQ(
+        platform::is_npu_place(place),
+        true,
+        platform::errors::PreconditionNotMet(
+            "Required device shall be NPUPlace, but received %s. ", place));
+
+    device_id_ = place.device;
+    PADDLE_ENFORCE_GT(
+        device_id_,
+        -1,
+        platform::errors::PreconditionNotMet(
+            "Required DeviceOption.device_id > -1, but received %d. ",
+            device_id_));
+    inner_event_ = NpuEventResourcePool::Instance().New(device_id_);
+  }
+  // Event handle returned by NpuEventResourcePool::New.
+  std::shared_ptr<NpuEventObject> inner_event_;
+  // Device index copied from `place.device`; -1 until the constructor sets it.
+  int device_id_{-1};
+};
+
+// Creation hook for NPU events: builds an NPUDeviceEventWrapper for `place`
+// and installs it into `event`. The unnamed unsigned int argument is unused.
+void DeviceEventCreateNPU(DeviceEvent* event,
+                          const platform::Place& place,
+                          unsigned int) {
+  auto npu_wrapper = std::make_shared<NPUDeviceEventWrapper>(place);
+  event->InitEvent(npu_wrapper);
+}
+
+// Records the wrapped NPU event on the stream of `context`.
+// Raises PreconditionNotMet when `context` is not an NPUDeviceContext.
+void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) {
+  auto* holder = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
+  auto* npu_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
+  PADDLE_ENFORCE_NOT_NULL(
+      npu_ctx,
+      platform::errors::PreconditionNotMet(
+          "Failed to dynamic_cast context into NPUDeviceContext."));
+  // Fetch the raw event handle first, then record it on the context's stream.
+  auto* raw_event = holder->inner_event_.get();
+  NPUEventRecord(raw_event, npu_ctx->stream());
+}
+
+// Returns true when the wrapped NPU event is reported as complete.
+// Raises PreconditionNotMet when `event` holds no wrapper.
+bool DeviceEventQueryNPU(const DeviceEvent* event) {
+  auto* wrapper = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
+  // The cast above is a static_cast, so a null pointer here means the event
+  // holds nothing, not that a type conversion failed; the message says so.
+  PADDLE_ENFORCE_NOT_NULL(
+      wrapper,
+      platform::errors::PreconditionNotMet(
+          "Failed to get NPUDeviceEventWrapper from event."));
+  // Start from COMPLETE; NPUEventQuery receives a pointer to this variable.
+  aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE;
+  platform::NPUEventQuery(wrapper->inner_event_.get(), &status);
+  return ACL_EVENT_STATUS_COMPLETE == status;
+}
+
+// Synchronizes on the wrapped NPU event through NPUEventSynchronize.
+void DeviceEventFinishNPU(const DeviceEvent* event) {
+  auto* stored = event->GetEvent().get();
+  auto* holder = static_cast<NPUDeviceEventWrapper*>(stored);
+  NPUEventSynchronize(holder->inner_event_.get());
+}
+
+// Makes the stream of `context` wait on the wrapped NPU event.
+// Raises PreconditionNotMet when `context` is not an NPUDeviceContext.
+void DeviceEventNPUWaitNPU(const DeviceEvent* event,
+                           const DeviceContext* context) {
+  auto* holder = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
+  auto* npu_ctx = dynamic_cast<const platform::NPUDeviceContext*>(context);
+  PADDLE_ENFORCE_NOT_NULL(
+      npu_ctx,
+      platform::errors::PreconditionNotMet(
+          "Failed to dynamic_cast context into NPUDeviceContext."));
+  // Look up the waiting stream first, then hand it the raw event handle.
+  auto waiting_stream = npu_ctx->stream();
+  NPUStreamWaitEvent(waiting_stream, holder->inner_event_.get());
+}
+
+// Host-side wait on an NPU event: synchronizes on the wrapped event.
+// `context` is accepted to match the wait-hook signature but is not used.
+void DeviceEventCPUWaitNPU(const DeviceEvent* event,
+                           const DeviceContext* context) {
+  auto* holder = static_cast<NPUDeviceEventWrapper*>(event->GetEvent().get());
+  NPUEventSynchronize(holder->inner_event_.get());
+}
+
+// Set-finished hook registered for kNPU events. Intentionally a no-op:
+// `event` is not read or modified.
+void DeviceEventSetFinishedNPU(const DeviceEvent* event) {
+  // do nothing
+}
+
+// Reset hook registered for kNPU events. Intentionally a no-op:
+// `event` is not read or modified.
+void EventResetNPU(const DeviceEvent* event) {
+  // do nothing
+}
+
+}  // namespace platform
+}  // namespace paddle
+
+// Bring the device-type tags used by the registration macros into scope.
+using ::paddle::platform::kCPU;
+using ::paddle::platform::kNPU;
+// Register the NPU implementations of the per-event hooks
+// (create / record / query / finish / set-finished).
+REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU)
+REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU)
+REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU)
+REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU)
+REGISTER_EVENT_SET_FINISHED_FUNCTION(
+    kNPU, paddle::platform::DeviceEventSetFinishedNPU)
+// Register the wait hooks for the (kNPU, kNPU) and (kCPU, kNPU) pairs.
+REGISTER_EVENT_WAIT_FUNCTION(kNPU,
+                             kNPU,
+                             paddle::platform::DeviceEventNPUWaitNPU)
+REGISTER_EVENT_WAIT_FUNCTION(kCPU,
+                             kNPU,
+                             paddle::platform::DeviceEventCPUWaitNPU)
+// Register the reset hook for kNPU events.
+REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU)
+#endif
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -1400,9 +1400,8 @@ class Executor(object):
            program = pruned_program

        def _can_use_interpreter_core(program, place):
-            if core.is_compiled_with_npu() or core.is_compiled_with_mlu(
-            ) or core.is_compiled_with_ipu() or isinstance(
-                    place, core.CustomPlace):
+            if core.is_compiled_with_mlu() or core.is_compiled_with_ipu(
+            ) or isinstance(place, core.CustomPlace):
                return False

            compiled = isinstance(program, compiler.CompiledProgram)