diff --git a/paddle/fluid/framework/new_executor/data_transfer.cc b/paddle/fluid/framework/new_executor/data_transfer.cc
index 701f0a430aa5c24f6113fd43bc5015f40d1f2dce..b856bbec4b0c47f387487a79388013ed91b1fc32 100644
--- a/paddle/fluid/framework/new_executor/data_transfer.cc
+++ b/paddle/fluid/framework/new_executor/data_transfer.cc
@@ -137,6 +137,13 @@ void DataTranferHelper::RunAndConstructOpFuncNode(
   new_op_func_node.output_index["Out"] = {var_scope_->VarId(new_var_name)};
   new_op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second);
   new_op_func_node.kernel_func_(exec_ctx);
+  // NOTE(winter-wang): on the npu device, the D2H kernel is asynchronous and
+  // needs explicit synchronization.
+#ifdef PADDLE_WITH_ASCEND_CL
+  if (op_type == kMemcpyD2H) {
+    dev_ctx->Wait();
+  }
+#endif
   // NOTE(Aurelius84): data_transform_op is expensive operation, so we tag them
   // as kQueueSync and execute them in thread pool.
   new_op_func_node.type_ = OpFuncType::kQueueSync;
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 3c66eb0c4613cd2e8cf85ca611e3ca5348db91e2..c321069537c8974af6a231a6e46fe3e8f0dc16d9 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -90,6 +90,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
     auto local_scope = &var_scope_.GetMutableScope()->NewScope();
     local_scope_ = local_scope;
   }
+  var_scope_.SetLocalScope(local_scope_);
 
   // prune
 
@@ -115,7 +116,6 @@ InterpreterCore::~InterpreterCore() {
 interpreter::CostInfo InterpreterCore::DryRun(
     const std::vector<std::string>& feed_names,
     const std::vector<framework::LoDTensor>& feed_tensors) {
-  var_scope_.SetLocalScope(local_scope_);
   Prepare(feed_names, feed_tensors, true);
   interpreter::CostInfo cost_info;
   {
@@ -144,7 +144,6 @@ paddle::framework::FetchList InterpreterCore::Run(
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
   bool is_build = is_build_;
-  var_scope_.SetLocalScope(local_scope_);
   Prepare(feed_names, feed_tensors, is_build);
 
   if (is_build) {
@@ -153,8 +152,10 @@ paddle::framework::FetchList InterpreterCore::Run(
     // until the second step run.
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
   }
-
   if (create_local_scope_) {
     ClearLoDTensorArrayInLocalScope();
   }
@@ -174,7 +175,6 @@ paddle::framework::FetchList InterpreterCore::Run(
   platform::AttachPointerHashToMKLDNNKey(this, place_);
 #endif
   if (!is_build_) {
-    var_scope_.SetLocalScope(local_scope_);
    paddle::framework::interpreter::build_variable_scope(block_, &var_scope_);
 
     std::vector<paddle::framework::OpFuncNode> op_func_nodes;
@@ -196,12 +196,14 @@ paddle::framework::FetchList InterpreterCore::Run(
 
     async_work_queue_ = GetWorkQueue();
     ExecuteInstructionList(vec_instruction_);
+#ifdef PADDLE_WITH_ASCEND_CL
+    platform::DeviceContextPool::Instance().Get(place_)->Wait();
+#endif
   }
 
   if (create_local_scope_) {
     ClearLoDTensorArrayInLocalScope();
   }
-
   // return Fetch Tensors
   auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
   if (fetch_var) {
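The Wait() calls added above all address the same hazard: on NPU the D2H memcpy kernel returns before the copy has landed in host memory, so the host must block explicitly before reading the result or finishing a step. A standalone sketch of the hazard, with std::async standing in for the device stream (illustrative names, not the Paddle API):

    #include <cassert>
    #include <future>
    #include <vector>

    // Toy stand-in for an asynchronous D2H copy: a "device stream" fills the
    // host buffer on another thread and completes at some later point.
    std::future<void> AsyncD2HCopy(const std::vector<float>& device_src,
                                   std::vector<float>* host_dst) {
      return std::async(std::launch::async,
                        [&device_src, host_dst] { *host_dst = device_src; });
    }

    int main() {
      std::vector<float> device_src(1024, 1.0f);
      std::vector<float> host_dst;
      auto done = AsyncD2HCopy(device_src, &host_dst);
      // Reading host_dst here would race with the in-flight copy; the
      // interpreter's dev_ctx->Wait() plays the role of this explicit wait.
      done.wait();
      assert(host_dst.size() == 1024);  // safe only after the wait
      return 0;
    }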
@@ -528,6 +530,17 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
   VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
   Scope* local_scope = create_local_scope_ ?
       var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope();
+
+#ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+  // values directly; only the special `float_status` variable records whether
+  // an operation has overflowed. More about `float_status`, see:
+  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+  if (FLAGS_check_nan_inf) {
+    framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+  }
+#endif
+
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
     // If it is OperatorBase, InferShape do nothing.
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index 1a539c1ce1cea0350e9a895e4d25a606e081ee4a..acbcf1da4c5e3e4fddf1e5aad074f3e4d2ca8fdf 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -15,6 +15,7 @@
 
 #include <algorithm>
 
+#include "paddle/fluid/framework/details/nan_inf_utils.h"
 #include "paddle/fluid/framework/executor_gc_helper.h"
 #include "paddle/fluid/framework/new_executor/data_transfer.h"
 #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h"
@@ -43,6 +44,7 @@ PADDLE_DEFINE_EXPORTED_bool(
     "Enable serial execution for standalone executor, used for debug.");
 
 DECLARE_bool(use_mkldnn);
+DECLARE_bool(check_nan_inf);
 
 namespace paddle {
 namespace framework {
@@ -446,11 +448,19 @@ void build_op_func_list(const platform::Place& place,
     op_func_node.output_index = outs_name2id;
     VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
 
+#ifdef PADDLE_WITH_ASCEND_CL
+    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
+    // values directly; only the special `float_status` variable records
+    // whether an operation has overflowed. More about `float_status`, see:
+    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
+    if (FLAGS_check_nan_inf) {
+      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
+    }
+#endif
+
     if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
       // op is not a operatorwithkernel, so direcly run OperatorBase::Run()
       deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
-      VLOG(4) << "End run " << place << " "
-              << op_func_node.operator_base_->DebugStringEx(local_scope);
     } else {
       auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
           static_cast<const framework::OperatorWithKernel*>(op));
@@ -593,6 +603,12 @@ void build_op_func_list(const platform::Place& place,
                 << var_scope->GetNameById(p.second);
       }
     }
+
+    // for debug nan/inf
+    if (FLAGS_check_nan_inf) {
+      VLOG(4) << "Check nan/inf";
+      framework::details::CheckOpHasNanOrInf(*op, *runtime_scope, place);
+    }
   }
 
   VLOG(4) << "End run " << place << " "
@@ -768,12 +784,7 @@ void ShrinkDownstreamMap(std::map<int, std::list<int>>* downstream_map,
   // b: c
 
   // happens_before[i][j] means i should be executed before j
-  op_happens_before->resize(op_num);
-  for (size_t i = 0; i < op_num; ++i) {
-    (*op_happens_before)[i].resize(op_num);
-    std::fill(
-        (*op_happens_before)[i].begin(), (*op_happens_before)[i].end(), false);
-  }
+  op_happens_before->assign(op_num, std::vector<bool>(op_num, false));
 
   // bfs to get all next ops
   auto bfs = [&](size_t op_idx) {
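The ShrinkDownstreamMap change above replaces a resize-plus-fill loop with a single assign(). Both forms produce the same op_num x op_num matrix of false, but assign() discards any stale contents unconditionally, which the resize path only guarantees because of the explicit std::fill. A minimal sketch of the equivalence:

    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      const size_t op_num = 3;
      std::vector<std::vector<bool>> happens_before;

      // Old form: resize the outer vector, then resize and refill every row.
      happens_before.resize(op_num);
      for (size_t i = 0; i < op_num; ++i) {
        happens_before[i].resize(op_num);
        std::fill(happens_before[i].begin(), happens_before[i].end(), false);
      }

      // New form: one call that replaces any previous contents with an
      // op_num x op_num matrix of false, regardless of the prior size.
      happens_before.assign(op_num, std::vector<bool>(op_num, false));

      assert(happens_before.size() == op_num);
      assert(happens_before[0].size() == op_num);
      return 0;
    }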
@@ -883,6 +894,18 @@ std::map<int, std::list<int>> build_op_downstream_map(
         }
       }
     }
+    // the original output of an inplace op is also changed
+    if (!vec_instruction[op_idx].InplaceBackMap().empty()) {
+      auto& m = vec_instruction[op_idx].InplaceBackMap();
+      for (auto& p : m) {
+        auto& var = p.second;
+        if (var2min_rw_op.count(var)) {
+          for (auto dep_op : var2min_rw_op[var]) {
+            op2dependences[op_idx].insert(dep_op);
+          }
+        }
+      }
+    }
 
     // step2: update 2 var2xxxx data structure
     for (auto& item :
@@ -894,16 +917,6 @@ std::map<int, std::list<int>> build_op_downstream_map(
       }
     }
 
-    for (auto& item :
-         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
-      for (auto var : item.second) {
-        if (remove_duplicate.count(var) ==
-            0) {  // var in input list and in output list, so remove it.
-          update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
-        }
-      }
-    }
-
     // NOTE(zhiqiu): The inplace op with `transfer` also changes
     // original output after that so add original output as well
     // original: a->op->a
@@ -914,8 +927,16 @@ std::map<int, std::list<int>> build_op_downstream_map(
       for (auto& p : m) {
         auto var = p.second;
         var2recent_write_op[var] = op_idx;
-        // var in input list and in output list, so remove it.
-        if (remove_duplicate.count(var) == 0) {
+        var2min_rw_op[var] = {static_cast<int>(op_idx)};
+        remove_duplicate.insert(var);
+      }
+    }
+
+    for (auto& item :
+         vec_instruction[op_idx].Inputs()) {  // for all inputs(read only)
+      for (auto var : item.second) {
+        if (remove_duplicate.count(var) ==
+            0) {  // var in input list and in output list, so remove it.
           update_var_min_rw_op(op2dependences, &var2min_rw_op, op_idx, var);
         }
       }
diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h
index 70a92f0ae28aed3240fbeee62c5fdc7133dcbcb3..af3951f4538f12f035fdc0e5944c75ff33fb63f8 100644
--- a/paddle/fluid/framework/new_executor/new_executor_defs.h
+++ b/paddle/fluid/framework/new_executor/new_executor_defs.h
@@ -389,7 +389,8 @@ static bool IsCpuOp(const Instruction& instr) {
 
 // is supported heterogeneous place
 static bool IsSupportedHetePlace(const phi::Place& place) {
-  return platform::is_gpu_place(place) || platform::is_xpu_place(place);
+  return platform::is_gpu_place(place) || platform::is_npu_place(place) ||
+         platform::is_xpu_place(place);
 }
 
 }  // namespace interpreter
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc
index b7a7e4c0b546ff76b813e6c2465e57bed6bca632..086dac8dac1fbf2ce82cc31089ceb57933b4415e 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.cc
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc
@@ -21,23 +21,37 @@
 namespace paddle {
 namespace framework {
+namespace {
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    d2h_ctxs = nullptr;
+std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>>*
+    h2d_ctxs = nullptr;
+std::mutex ctx_mtx;
+}  // namespace
 
 StreamAnalyzer::StreamAnalyzer(const platform::Place& place) : place_(place) {
-  if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    platform::EmplaceDeviceContexts(
-        &d2h_ctxs_,
-        {place},
-        /*disable_setting_default_stream_for_allocator=*/true);
-    platform::EmplaceDeviceContexts(
-        &h2d_ctxs_,
-        {place},
-        /*disable_setting_default_stream_for_allocator=*/true);
-#else
-    PADDLE_THROW(
-        platform::errors::Unimplemented("CUDAPlace is not supported. Please "
-                                        "re-compile with WITH_GPU option."));
-#endif
+  if (platform::is_gpu_place(place) || platform::is_npu_place(place)) {
+    std::lock_guard<std::mutex> lk(ctx_mtx);
+    if (d2h_ctxs == nullptr) {
+      d2h_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+      h2d_ctxs = new std::map<
+          Place,
+          std::shared_future<std::unique_ptr<platform::DeviceContext>>>();
+    }
+    if (d2h_ctxs->find(place) == d2h_ctxs->end()) {
+      platform::EmplaceDeviceContexts(
+          d2h_ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true);
+      platform::EmplaceDeviceContexts(
+          h2d_ctxs,
+          {place},
+          /*disable_setting_default_stream_for_allocator=*/true);
+    }
+    d2h_ctx_ = (*d2h_ctxs)[place];
+    h2d_ctx_ = (*h2d_ctxs)[place];
   }
 }
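The constructor now keeps one d2h/h2d context pair per place in process-global maps, so every StreamAnalyzer created for the same place shares the same pair of copy streams instead of owning a private map. The real code stores std::shared_future<std::unique_ptr<DeviceContext>> because EmplaceDeviceContexts constructs contexts asynchronously; the sketch below simplifies that to a shared_ptr and keeps only the lazily initialized, mutex-guarded cache shape (generic types, not the Paddle API):

    #include <map>
    #include <memory>
    #include <mutex>
    #include <string>

    // Generic stand-in for platform::DeviceContext keyed by Place.
    struct Context { std::string place; };

    std::shared_ptr<Context> GetCachedContext(const std::string& place) {
      static std::map<std::string, std::shared_ptr<Context>> cache;
      static std::mutex mtx;
      std::lock_guard<std::mutex> lk(mtx);
      auto it = cache.find(place);
      if (it == cache.end()) {  // first analyzer for this place creates it
        it = cache.emplace(place, std::make_shared<Context>(Context{place}))
                 .first;
      }
      return it->second;  // later analyzers share the same context/stream
    }

    int main() {
      auto a = GetCachedContext("npu:0");
      auto b = GetCachedContext("npu:0");
      return a == b ? 0 : 1;  // same object: one copy stream per place
    }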
Please " - "re-compile with WITH_GPU option.")); -#endif + if (platform::is_gpu_place(place) || platform::is_npu_place(place)) { + std::lock_guard lk(ctx_mtx); + if (d2h_ctxs == nullptr) { + d2h_ctxs = new std::map< + Place, + std::shared_future>>(); + h2d_ctxs = new std::map< + Place, + std::shared_future>>(); + } + if (d2h_ctxs->find(place) == d2h_ctxs->end()) { + platform::EmplaceDeviceContexts( + d2h_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + platform::EmplaceDeviceContexts( + h2d_ctxs, + {place}, + /*disable_setting_default_stream_for_allocator=*/true); + } + d2h_ctx_ = (*d2h_ctxs)[place]; + h2d_ctx_ = (*h2d_ctxs)[place]; } } @@ -162,15 +176,15 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( const OpFuncNode& op_func_node) { auto& op_type = op_func_node.operator_base_->Type(); auto* dev_ctx = op_func_node.dev_ctx_; - // only gpu need update. xpu not need, because xpu memcpy op kernel is + // only gpu/npu need update. xpu not need, because xpu memcpy op kernel is // synchronous. - if (platform::is_gpu_place(place_)) { + if (platform::is_gpu_place(place_) || platform::is_npu_place(place_)) { if (op_type == interpreter::kMemcpyD2H) { VLOG(3) << "Get dev_ctx from d2h_context_pool_"; - dev_ctx = d2h_ctxs_[place_].get().get(); + dev_ctx = d2h_ctx_.get().get(); } else if (op_type == interpreter::kMemcpyH2D) { VLOG(3) << "Get dev_ctx from h2d_context_pool_"; - dev_ctx = h2d_ctxs_[place_].get().get(); + dev_ctx = h2d_ctx_.get().get(); } } return dev_ctx; @@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext( */ bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr, const Instruction& next_instr) { - return platform::is_xpu_place(place_) || - (&cur_instr.DeviceContext() == &next_instr.DeviceContext() || - interpreter::IsCpuOp(cur_instr) || - interpreter::IsMemcpyD2H(cur_instr) || - interpreter::IsMemcpyH2D(next_instr)); + if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true; + + // xpu memcpy kerenl is synchronous. + if (platform::is_xpu_place(place_)) return true; + + // npu d2h kernel is asynchronous. 
@@ -188,11 +202,20 @@ platform::DeviceContext* StreamAnalyzer::ParseDeviceContext(
  */
 bool StreamAnalyzer::IsDirectRun(Instruction& cur_instr,
                                  const Instruction& next_instr) {
-  return platform::is_xpu_place(place_) ||
-         (&cur_instr.DeviceContext() == &next_instr.DeviceContext() ||
-          interpreter::IsCpuOp(cur_instr) ||
-          interpreter::IsMemcpyD2H(cur_instr) ||
-          interpreter::IsMemcpyH2D(next_instr));
+  if (&cur_instr.DeviceContext() == &next_instr.DeviceContext()) return true;
+
+  // xpu memcpy kernel is synchronous.
+  if (platform::is_xpu_place(place_)) return true;
+
+  // npu d2h kernel is asynchronous.
+  if (platform::is_npu_place(place_)) {
+    return interpreter::IsCpuOp(cur_instr) ||
+           interpreter::IsMemcpyH2D(next_instr);
+  }
+  // gpu or cpu
+  return interpreter::IsCpuOp(cur_instr) ||
+         interpreter::IsMemcpyD2H(cur_instr) ||
+         interpreter::IsMemcpyH2D(next_instr);
 }
 
 platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
@@ -201,6 +224,8 @@ platform::DeviceType StreamAnalyzer::GetWaiterType(const Instruction& instr) {
   } else {
     if (platform::is_xpu_place(place_)) {
       return platform::kXPU;
+    } else if (platform::is_npu_place(place_)) {
+      return platform::kNPU;
     }
     return platform::kCUDA;
   }
diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.h b/paddle/fluid/framework/new_executor/stream_analyzer.h
index 61e37bbb686fcd3f111680d0ed77b41ad12ee8cd..4be8ffe6bb4caeb91d91214aad630f7b1abfee6d 100644
--- a/paddle/fluid/framework/new_executor/stream_analyzer.h
+++ b/paddle/fluid/framework/new_executor/stream_analyzer.h
@@ -53,9 +53,9 @@ class StreamAnalyzer {
 
   platform::DeviceType GetWaiterType(const Instruction& instr);
 
-  Place place_;
-  std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>> d2h_ctxs_;
-  std::map<Place, std::shared_future<std::unique_ptr<platform::DeviceContext>>> h2d_ctxs_;
+  const Place place_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> d2h_ctx_;
+  std::shared_future<std::unique_ptr<platform::DeviceContext>> h2d_ctx_;
 
   std::map<size_t, std::shared_ptr<platform::DeviceEvent>> var_id2event_;
 };
diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc
index 4364934a4027d9022e5939fc8db0a52ce7d3d5d8..917cebc11f9a904191591f7eac60cfce0ea531d7 100644
--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -1080,11 +1080,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place,
   } else {
     return m->GetAllocator(p, size)->Allocate(size);
   }
-#elif defined PADDLE_WITH_XPU
+#elif defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL)
   return GetAllocator(place)->Allocate(size);
 #else
-  PADDLE_THROW(
-      platform::errors::PreconditionNotMet("Not compiled with GPU or XPU."));
+  PADDLE_THROW(platform::errors::PreconditionNotMet(
+      "Not compiled with GPU or XPU or NPU."));
 #endif
 }
 
diff --git a/paddle/fluid/operators/crop_op_npu.cc b/paddle/fluid/operators/crop_op_npu.cc
index 6c4c6eb25d8204d9429a1da0617458e5cd9481ab..bd50dea15f80e9b58321b825cf936f778f6a1a43 100644
--- a/paddle/fluid/operators/crop_op_npu.cc
+++ b/paddle/fluid/operators/crop_op_npu.cc
@@ -70,8 +70,12 @@ class CropNPUKernel : public framework::OpKernel<T> {
                           shape->dims().size(),
                           x->dims().size()));
 
+    // shape memory may be freed by gc before the async kernel runs.
+    Tensor tmp_shape(*shape);
+    tmp_shape.mutable_data<T>(ctx.GetPlace());
+
     const auto& runner =
-        NpuOpRunner("Crop", {*x, *shape}, {*out}, attr_input);
+        NpuOpRunner("Crop", {*x, tmp_shape}, {*out}, attr_input);
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
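The crop_op_npu.cc fix above relies on Tensor's shallow copy semantics: tmp_shape shares the allocation behind *shape, so the buffer stays alive for the asynchronously launched "Crop" kernel even if the scope GC drops the original variable first. The same idea with a shared_ptr standing in for the shared allocation (a sketch, not the Tensor API):

    #include <memory>
    #include <thread>
    #include <vector>

    // shared_ptr stands in for a Tensor's shared allocation: the async
    // "kernel" holds its own reference, so releasing the caller's handle
    // cannot free the buffer while the kernel is still in flight.
    int main() {
      auto shape = std::make_shared<std::vector<float>>(4, 1.0f);
      std::thread kernel([held = shape] {  // the copy pins the buffer
        float sum = 0;
        for (float v : *held) sum += v;
        (void)sum;
      });
      shape.reset();  // original handle "gc-ed"; the allocation survives
      kernel.join();  // kernel finishes safely, then the buffer is freed
      return 0;
    }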
"); + AddAttr("dst_place_type", + "Determine the dst place of tensor copy. " + "By Now it support:" + "0. CUDAPinnedPlace/CPU <->CUDAPlace" + "1. NPUPinnedPlace/CPU <-> NPUPlace" + "2. CPU <->XPUPlace" + "Other place type is Unimplemented and will cause ERROR."); AddComment(R"DOC( MemcpyD2H Operator. By now, it ONLY supports the memcopy between CUDAPinnedPlace/CPU <-> NPUPlace/CUDAPlace. diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index efe0479871215a593306f1edb2b5f2d987ffd74d..b00e4056259d93face5ab11304388c18d4956fe8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -280,6 +280,16 @@ if(WITH_XPU) CACHE INTERNAL "device event libs") endif() +if(WITH_ASCEND_CL) + cc_library( + device_event_npu + SRCS device_event_npu.cc + DEPS device_event_base npu_resource_pool) + set(DEVICE_EVENT_LIBS + device_event_npu + CACHE INTERNAL "device event libs") +endif() + if(WITH_GPU) nv_library( device_event_gpu diff --git a/paddle/fluid/platform/device/npu/npu_info.cc b/paddle/fluid/platform/device/npu/npu_info.cc index 362c4e8fae8b1368245bfe6f95d7e0c1adc44e2c..9acdef985ade20004d88ed9a1ea2d6b25527592d 100644 --- a/paddle/fluid/platform/device/npu/npu_info.cc +++ b/paddle/fluid/platform/device/npu/npu_info.cc @@ -285,6 +285,10 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtQueryEvent(event, status)); } +void NPUEventSynchronize(aclrtEvent event) { + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeEvent(event)); +} + void NPUStreamWaitEvent(aclrtStream stream, aclrtEvent event) { PADDLE_ENFORCE_NPU_SUCCESS(aclrtStreamWaitEvent(stream, event)); } diff --git a/paddle/fluid/platform/device/npu/npu_info.h b/paddle/fluid/platform/device/npu/npu_info.h index f7af1c246ef6c2afc1c1caf0013796cac6ea3089..ea55831db2e225ae2b1accf8ce589deff47f1e8d 100644 --- a/paddle/fluid/platform/device/npu/npu_info.h +++ b/paddle/fluid/platform/device/npu/npu_info.h @@ -138,6 +138,9 @@ void NPUEventQuery(aclrtEvent event, aclrtEventStatus *status); //! Record NPU event in the stream. void NPUEventRecord(aclrtEvent event, aclrtStream stream); +//! Synchronize NPU event. +void NPUEventSynchronize(aclrtEvent event); + //! Makes a stream wait on an event. 
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a668d7f4b8366d8240f3974275e3afe28bc3f242..6bceb696c0f8e18c36472be34eb8efb2a471cd85 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -125,6 +125,8 @@ DeviceType Place2DeviceType(const platform::Place& place) {
     return platform::DeviceType::XPU;
   } else if (platform::is_ipu_place(place)) {
     return platform::DeviceType::IPU;
+  } else if (platform::is_npu_place(place)) {
+    return platform::DeviceType::NPU;
   } else if (platform::is_mlu_place(place)) {
     return platform::DeviceType::MLU;
   } else {
diff --git a/paddle/fluid/platform/device_event.h b/paddle/fluid/platform/device_event.h
index 1fd116600624ca58c469f9460644ebef76a94923..2edccfa90c9395236c74ce0292664a3d581e2ba1 100644
--- a/paddle/fluid/platform/device_event.h
+++ b/paddle/fluid/platform/device_event.h
@@ -25,6 +25,7 @@
 
 using ::paddle::platform::kCPU;
 using ::paddle::platform::kCUDA;
+using ::paddle::platform::kNPU;
 using ::paddle::platform::kXPU;
 
 USE_EVENT(kCPU)
@@ -41,3 +42,9 @@ USE_EVENT(kXPU);
 USE_EVENT_WAIT(kXPU, kXPU)
 USE_EVENT_WAIT(kCPU, kXPU)
 #endif
+
+#ifdef PADDLE_WITH_ASCEND_CL
+USE_EVENT(kNPU);
+USE_EVENT_WAIT(kNPU, kNPU)
+USE_EVENT_WAIT(kCPU, kNPU)
+#endif
diff --git a/paddle/fluid/platform/device_event_base.h b/paddle/fluid/platform/device_event_base.h
index b42721a60d974a9f353adc0c4b56c817f24f8fbf..51df0fd4f40adc42780333cc7fd90bb39634ac6e 100644
--- a/paddle/fluid/platform/device_event_base.h
+++ b/paddle/fluid/platform/device_event_base.h
@@ -66,7 +66,7 @@ class DeviceEvent {
                           type_id_));
     // TODO(Aurelius84): only support CPU/CUDA, need consider XPU/NPU later
     PADDLE_ENFORCE_LT(type_id_,
-                      3,
+                      4,
                       platform::errors::Unavailable(
                           "Currently DeviceEvent do not support %s", place));
     PADDLE_ENFORCE_NOT_NULL(
", + device_id_)); + inner_event_ = NpuEventResourcePool::Instance().New(device_id_); + } + std::shared_ptr inner_event_; + int device_id_; +}; + +void DeviceEventCreateNPU(DeviceEvent* event, + const platform::Place& place, + unsigned int) { + event->InitEvent(std::make_shared(place)); +} + +void DeviceEventRecordNPU(DeviceEvent* event, const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUEventRecord(wrapper->inner_event_.get(), npu_dev_ctx->stream()); +} + +bool DeviceEventQueryNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + PADDLE_ENFORCE_NOT_NULL( + wrapper, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast event into NPUDeviceEventWrapper.")); + aclrtEventStatus status = ACL_EVENT_STATUS_COMPLETE; + platform::NPUEventQuery(wrapper->inner_event_.get(), &status); + return ACL_EVENT_STATUS_COMPLETE == status; +} + +void DeviceEventFinishNPU(const DeviceEvent* event) { + auto* wrapper = static_cast(event->GetEvent().get()); + NPUEventSynchronize(wrapper->inner_event_.get()); +} + +void DeviceEventNPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + auto* wrapper = static_cast(event->GetEvent().get()); + auto* npu_dev_ctx = dynamic_cast(context); + PADDLE_ENFORCE_NOT_NULL( + npu_dev_ctx, + platform::errors::PreconditionNotMet( + "Failed to dynamic_cast context into NPUDeviceContext.")); + NPUStreamWaitEvent(npu_dev_ctx->stream(), wrapper->inner_event_.get()); +} + +void DeviceEventCPUWaitNPU(const DeviceEvent* event, + const DeviceContext* context) { + DeviceEventFinishNPU(event); +} + +void DeviceEventSetFinishedNPU(const DeviceEvent* event) { + // do nothing +} + +void EventResetNPU(const DeviceEvent* event) { + // do nothing +} + +} // namespace platform +} // namespace paddle + +using ::paddle::platform::kCPU; +using ::paddle::platform::kNPU; +REGISTER_EVENT_CREATE_FUNCTION(kNPU, paddle::platform::DeviceEventCreateNPU) +REGISTER_EVENT_RECORD_FUNCTION(kNPU, paddle::platform::DeviceEventRecordNPU) +REGISTER_EVENT_QUERY_FUNCTION(kNPU, paddle::platform::DeviceEventQueryNPU) +REGISTER_EVENT_FINISH_FUNCTION(kNPU, paddle::platform::DeviceEventFinishNPU) +REGISTER_EVENT_SET_FINISHED_FUNCTION( + kNPU, paddle::platform::DeviceEventSetFinishedNPU) +REGISTER_EVENT_WAIT_FUNCTION(kNPU, + kNPU, + paddle::platform::DeviceEventNPUWaitNPU) +REGISTER_EVENT_WAIT_FUNCTION(kCPU, + kNPU, + paddle::platform::DeviceEventCPUWaitNPU) +REGISTER_EVENT_RESET_FUNCTION(kNPU, paddle::platform::EventResetNPU) +#endif diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 3303b6c9472ff9bcac0ab8da822c1cdfcd0635b5..5f80e3b7577707c68e4439db309edc835afade4f 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1400,9 +1400,8 @@ class Executor(object): program = pruned_program def _can_use_interpreter_core(program, place): - if core.is_compiled_with_npu() or core.is_compiled_with_mlu( - ) or core.is_compiled_with_ipu() or isinstance( - place, core.CustomPlace): + if core.is_compiled_with_mlu() or core.is_compiled_with_ipu( + ) or isinstance(place, core.CustomPlace): return False compiled = isinstance(program, compiler.CompiledProgram)