// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore.h" #include #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/interpretercore_event_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/profiler.h" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/framework/new_executor/interpretercore_fast_garbage_collector.h" #endif PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope, true, "Use local_scope in new executor(especially used " "in UT), can turn off for better performance"); DECLARE_bool(check_nan_inf); DECLARE_bool(benchmark); DECLARE_bool(fast_eager_deletion_mode); DECLARE_bool(use_stream_safe_cuda_allocator); constexpr const char* kExceptionCaught = "ExceptionCaught"; constexpr const char* kTaskCompletion = "TaskCompletion"; namespace paddle { namespace framework { // NOTE(Aurelius84): Need a better strategy to determine it. static constexpr size_t kHostNumThreads = 4; bool IsInterpretercoreFastGCEnabled() { return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator; } InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, VariableScope* global_scope) : place_(place), block_(block), global_scope_(global_scope), stream_analyzer_(place) { is_build_ = false; async_work_queue_.reset( new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_)); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsInterpretercoreFastGCEnabled()) { gc_ = std::make_unique(); } else { gc_ = std::make_unique(); } #else gc_ = std::make_unique(); #endif exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion); create_local_scope_ = FLAGS_new_executor_use_local_scope; if (FLAGS_new_executor_use_local_scope) { auto local_scope = &global_scope->GetMutableScope()->NewScope(); local_scope->AddListener(global_scope->Listener()); local_scope_ = local_scope; } VLOG(4) << "create_local_scope_ is " << create_local_scope_; // prune // optmize graph pass // convert to run graph } InterpreterCore::~InterpreterCore() { // cancle gc's thread gc_.reset(nullptr); exception_notifier_->UnregisterEvent(); completion_notifier_->UnregisterEvent(); async_work_queue_.reset(nullptr); } void InterpreterCore::SetCopyProgram(std::shared_ptr prog) { copy_program_ = prog; } paddle::framework::FetchList InterpreterCore::Run( const std::vector& feed_names, const std::vector& feed_tensors) { bool is_build = is_build_; global_scope_->SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, is_build); if (is_build) { ExecuteInstructionList(vec_instruction_); } if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } // return Fetch Tensors auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName); return std::move(*fetch_var->GetMutable()); } paddle::framework::FetchList InterpreterCore::Run( const std::vector& feed_names) { if (!is_build_) { if (create_local_scope_ && global_scope_->GetMutableLocalScope() != global_scope_->GetMutableScope() && global_scope_->GetMutableLocalScope()) { VLOG(4) << "Clear previous local scope before run"; VLOG(4) << global_scope_->GetMutableScope() << " " << global_scope_->GetMutableLocalScope(); platform::DeviceContextPool::Instance().Get(place_)->Wait(); // TODO(zhiqiu): clear the tensor holder of all vars in previous local // scope? } global_scope_->SetLocalScope(local_scope_); paddle::framework::interpreter::build_variable_scope(block_, global_scope_, create_local_scope_); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( place_, block_, &op_func_nodes, global_scope_, create_local_scope_); is_build_ = true; SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); } else { ExecuteInstructionList(vec_instruction_); } if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } // return Fetch Tensors auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName); return std::move(*fetch_var->GetMutable()); } // At the end of each step, the holder of Tensor in LoDTensorArray is null. // Clear these Tensors and leave LoDTensorArray empty, otherwise an exception // will occur in the next step void InterpreterCore::ClearLoDTensorArrayInLocalScope() { auto vars = local_scope_->LocalVars(); for (auto var : vars) { if (var->IsType()) { auto* lod_tensor_arr = var->GetMutable(); lod_tensor_arr->clear(); } } } void InterpreterCore::BuildOperatorDependences() { // analysis the dependences between ops, set the dependecy_count_ and Call // Schedule auto op_nums = vec_instruction_.size(); dependecy_count_.resize(op_nums); auto op2downstream = interpreter::build_op_downstream_map(vec_instruction_); for (size_t op = 0; op < vec_instruction_.size(); ++op) { auto op_list = op2downstream[op]; std::vector downsteam_vector(op_list.begin(), op_list.end()); stream_analyzer_.Schedule(downsteam_vector, &vec_instruction_, op); for (auto inst_id : op_list) { dependecy_count_[inst_id]++; } } } void InterpreterCore::Convert( std::vector* op_func_nodes) { auto& vec_meta_info = global_scope_->MutableVecMetaInfo(); auto var_nums = global_scope_->VarSize(); input_var2op_info_.resize(var_nums); auto nodes = *op_func_nodes; auto op_nums = nodes.size(); vec_instruction_.reserve(op_nums); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node); vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_); auto& instr = vec_instruction_.back(); OpInOutInfo info; std::vector gc_check_input_list; for (auto& item : op_func_node.input_index) { for (auto id : item.second) { if (id == kEmptyVarIndex) { continue; } input_var2op_info_.at(id).push_back(op_idx); // var can be gc-ed if (!info.IsBuilt()) { info.Build(op_func_node.operator_base_.get()); } auto* var_desc = global_scope_->VarDesc(id); if (var_desc) { if (info.IsInArgBufferNeeded(var_desc->Name())) { gc_check_input_list.push_back(id); } } else { gc_check_input_list.push_back(id); } } } std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); auto last = std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); gc_check_input_list.erase(last, gc_check_input_list.end()); for (auto var_id : gc_check_input_list) { vec_meta_info[var_id].var_ref_count_++; instr.AddGCCheckVar(var_id); VLOG(4) << "clear " << global_scope_->GetNameById(var_id) << " after " << instr.OpBase()->Type(); } } for (size_t i = 0; i < vec_instruction_.size(); ++i) { // checkout ouput for (auto& item : vec_instruction_[i].Outputs()) { for (auto id : item.second) { if (input_var2op_info_.at(id).size() == 0) { // output var not be used by any kernel vec_instruction_[i].AddGCCheckVar(id); VLOG(4) << "clear " << global_scope_->GetNameById(id) << " after " << vec_instruction_[i].OpBase()->Type(); vec_meta_info[id].var_ref_count_++; } } } } BuildOperatorDependences(); for (size_t i = 0; i < vec_instruction_.size(); ++i) { BuildAndCacheInstructionCtx(&vec_instruction_[i]); } BuildSkipShareLoDInfo(); for (size_t i = 0; i < vec_instruction_.size(); ++i) { gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(), platform::GenerateDeviceEventFlag()); } if (FLAGS_new_executor_use_inplace) { BuildInplace(); } } bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) { if (!global_scope_->VarDesc(var_index)) { return input_var2op_info_.at(var_index).size() == 1; } else { int is_input_cnt = 0; for (auto inst_id : input_var2op_info_.at(var_index)) { OpInOutInfo info; info.Build(vec_instruction_.at(inst_id).OpBase()); if (info.IsInArgBufferNeeded(global_scope_->VarDesc(var_index)->Name())) { is_input_cnt++; } } return is_input_cnt == 1; } } void InterpreterCore::BuildInplace() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { auto& instr = vec_instruction_[i]; auto* op_base = instr.OpBase(); if (!op_base->Info().infer_inplace_) { continue; } auto in_to_outs = op_base->Info().infer_inplace_( platform::is_gpu_place(instr.DeviceContext().GetPlace())); auto& inputs = instr.Inputs(); auto& outputs = instr.Outputs(); for (auto& pair : in_to_outs) { auto iter = inputs.find(pair.first); if (iter != inputs.end() && !iter->second.empty()) { auto in_var_desc = global_scope_->VarDesc(iter->second[0]); if (in_var_desc && in_var_desc->Persistable()) { continue; } if (global_scope_->GetVarSikpInplace(iter->second[0])) { continue; } if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) { auto iterout = outputs.find(pair.second); if (iterout != outputs.end() && !iterout->second.empty()) { auto invar = global_scope_->Var(iter->second[0]); auto outvar = global_scope_->Var(iterout->second[0]); if (invar && outvar && invar->IsType() && outvar->IsType()) { instr.AddInplace(invar, outvar); VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type() << " " << global_scope_->GetNameById(iter->second[0]) << " -> " << global_scope_->GetNameById(iterout->second[0]) << std::endl; } } } } } } } void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) { VariableValueMap ins_map; for (auto& var_name_item : instr_node->Inputs()) { std::vector input_vars; input_vars.reserve(var_name_item.second.size()); for (auto& id : var_name_item.second) { input_vars.emplace_back(global_scope_->Var(id)); } ins_map.emplace(var_name_item.first, std::move(input_vars)); } VariableValueMap outs_map; for (auto& var_name_item : instr_node->Outputs()) { std::vector out_vars; out_vars.reserve(var_name_item.second.size()); for (auto& id : var_name_item.second) { out_vars.emplace_back(global_scope_->Var(id)); } outs_map.emplace(var_name_item.first, std::move(out_vars)); } // set runtime_ctx and infershape_ctx_ instr_node->ResetContext(ins_map, outs_map); } void InterpreterCore::BuildSkipShareLoDInfo() { for (size_t i = 0; i < vec_instruction_.size(); ++i) { bool can_skip_lod = true; for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) { for (auto& var : input.second) { if (var->IsType()) { if (var->Get().lod().size() != 0) { can_skip_lod = false; break; } } else { can_skip_lod = false; break; } } } vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); } } void InterpreterCore::RunInstruction(const Instruction& instr_node) { auto* op = instr_node.OpBase(); auto place = instr_node.DeviceContext().GetPlace(); VLOG(4) << "Start run " << place << " " << op->DebugStringEx(global_scope_); Scope* local_scope = create_local_scope_ ? global_scope_->GetMutableLocalScope() : global_scope_->GetMutableScope(); auto op_with_kernel = dynamic_cast(op); { platform::RecordEvent infershape_event("InferShape"); // If it is OperatorBase, InferShape do nothing. if (op_with_kernel != nullptr) op_with_kernel->InferShape(instr_node.InnerInferShapeContext().get()); } if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) { // TODO(xiongkun03) Does operator // base support inplace ? for (auto& pair : instr_node.InplaceInfo()) { const auto& in = paddle::framework::details::GetTensorFromVar(pair.first); auto* out = paddle::framework::details::GetMutableTensorFromVar(pair.second); if (in.dims() == out->dims()) { out->ShareBufferWith(in); } } } { platform::RecordEvent compute_event("Compute"); if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); } } VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_); /*For profiling/benchmark only*/ if (FLAGS_benchmark) { instr_node.DeviceContext().Wait(); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError()); VLOG(4) << "Operator(" << op->Type() << "): context wait and get last error"; #endif } // for debug nan/inf if (FLAGS_check_nan_inf) { VLOG(4) << "Check nan/inf"; framework::details::CheckOpHasNanOrInf( *op, *global_scope_, place); // TODO(xiongkun03) change it to inner scope. } } void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { async_work_queue_->PrepareAtomicDeps(dependecy_count_); async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo()); unfinished_op_numer_ = vec_instr.size(); exception_holder_.Clear(); for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [&, i] { RunInstructionAsync(i); }); } } auto event_name = main_thread_blocker_.WaitEvent(); VLOG(3) << "event_name: " << event_name; if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught " << exception_holder_.Type(); async_work_queue_->Cancel(); exception_holder_.ReThrow(); } } void InterpreterCore::RunNextInstructions( const Instruction& instr, std::queue* reserved_next_ops) { auto& next_instr = instr.NextInstructions(); auto& atomic_deps = async_work_queue_->AtomicDeps(); auto IsReady = [&](size_t next_id) { return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1; }; if (instr.KernelType() == OpFuncType::kQueueAsync) { // move all sync_ops into other threads for (auto next_id : next_instr.SyncRunIds()) { if (IsReady(next_id)) { async_work_queue_->AddTask( vec_instruction_[next_id].KernelType(), [&, next_id] { RunInstructionAsync(next_id); }); } } // keep all async_ops running in current thread for (auto next_id : next_instr.DirectRunIds()) { if (IsReady(next_id)) { reserved_next_ops->push(next_id); } } for (auto next_id : next_instr.EventRunIds()) { if (IsReady(next_id)) { reserved_next_ops->push(next_id); } } } else { // move async_ops into async_thread for (auto next_id : next_instr.EventRunIds()) { if (IsReady(next_id)) { async_work_queue_->AddTask( vec_instruction_[next_id].KernelType(), [&, next_id] { RunInstructionAsync(next_id); }); } } auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(), next_instr.DirectRunIds()); size_t first_op = 0; for (auto next_id : direct_run_ops) { if (IsReady(next_id)) { // only keep one op running in current thread if (first_op == 0) { first_op = next_id; continue; } // move rest ops into other threads async_work_queue_->AddTask( vec_instruction_[next_id].KernelType(), [&, next_id] { RunInstructionAsync(next_id); }); } } if (first_op != 0) reserved_next_ops->push(first_op); } } void InterpreterCore::RunInstructionAsync(size_t instr_id) { std::queue ready_ops; ready_ops.push(instr_id); while (!ready_ops.empty()) { instr_id = ready_ops.front(); ready_ops.pop(); auto& instr_node = vec_instruction_.at(instr_id); auto* op = instr_node.OpBase(); platform::RecordEvent instruction_event(op->Type().c_str()); interpreter::WaitEvent(instr_node, place_); try { RunInstruction(instr_node); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) RecordStreamForGC(instr_node); #endif CheckGC(instr_node); } catch (platform::EnforceNotMet& ex) { framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); } catch (platform::EOFException&) { exception_holder_.Catch(std::current_exception()); } catch (std::exception& ex) { LOG(WARNING) << op->Type() << " raises an exception " << platform::demangle(typeid(ex).name()) << ", " << ex.what(); exception_holder_.Catch(std::current_exception()); } catch (...) { LOG(WARNING) << op->Type() << " raises an unknown exception"; exception_holder_.Catch(std::current_exception()); } if (UNLIKELY(exception_holder_.IsCaught())) { VLOG(4) << "Exception caught"; if (exception_notifier_ != nullptr) { exception_notifier_->NotifyEvent(); } return; } VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_; if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) == 1)) { if (completion_notifier_ != nullptr) { completion_notifier_->NotifyEvent(); } } interpreter::RecordEvent(instr_node, place_); RunNextInstructions(instr_node, &ready_ops); } } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void InterpreterCore::RecordStreamForGC(const Instruction& instr) { if (!IsInterpretercoreFastGCEnabled() || instr.KernelType() != OpFuncType::kQueueAsync) { return; } gpuStream_t stream = reinterpret_cast( instr.DeviceContext()) .stream(); auto TensorRecordStream = [&stream](Tensor& tensor) { auto allocation = tensor.Holder(); if (allocation == nullptr) { return; } const platform::Place& place = allocation->place(); if (platform::is_gpu_place(place)) { memory::RecordStream(allocation, stream); } else if (platform::is_cuda_pinned_place(place)) { // TODO(Ruibiao): Here should do something to make sure that the tensor is // not freed until the H2D copies done. However, simplely launch a CUDA // runtime callback to the H2D stream may lead a high performance // overhead. As all the cases we meet in H2D are copies from CPUPlace at // present, we just log a WARNING here. A better design is required. LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous " "manner may lead a data inconsistent"; } else { // memory copies involve CPUPlace are always synchronous, so just do // nothing here } }; /* NOTE(Ruibiao)：Cross-stream tensor synchronization is required only when * all the following conditions are satisfied: * 1. The tensor will be GC after running the instruction, i.e., in * instr.GCCheckVars. * 2. The stream which initializes this tensor is different from the stream * which the instruction run in. * 3. The tensor is the instruction's input, cause we assume that instruction * will initialize all output tensors with its running stream. * 4. In the OP function of this instruction, the tensor is an input of a * async CUDA kernel. * * Here we only process the first condition, because: * 1. Since the RecordStream function will directly return when the recored * stream is equal to the owning stream, recording a stream same as which * initialized this tensor has less time overhead. Conversely, it may take * more time if we try to extract those cross-stream input vars from * instr.GCCheckVars. * 2. Now the instruction has no idea of which vars involving async running in * OP function, and thus we can not recognize condition 4. It should be * supported later. */ for (int var_id : instr.GCCheckVars()) { VLOG(4) << "GC sync " << global_scope_->GetNameById(var_id) << " " << global_scope_->VarDesc(var_id); // persistable var will be ignore while GC if (global_scope_->VarDesc(var_id) && global_scope_->VarDesc(var_id)->Persistable()) { continue; } paddle::framework::Variable* var = global_scope_->Var(var_id); if (var == nullptr) { continue; } if (var->IsType()) { TensorRecordStream(*(var->GetMutable())); } else if (var->IsType< operators::reader:: OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) { // do nothing } else if (var->IsType()) { TensorRecordStream(*(var->GetMutable()->mutable_value())); } else if (var->IsType()) { auto* tensor_arr = var->GetMutable(); for (auto& tensor : *tensor_arr) { TensorRecordStream(tensor); } } else if (var->IsType>()) { // do nothing } else { PADDLE_THROW(platform::errors::Unimplemented( "The variable(%s) is not supported in eager deletion.", framework::ToTypeName(var->Type()))); } } } #endif void InterpreterCore::CheckGC(const Instruction& instr) { size_t instr_id = instr.Id(); auto& var_scope = *global_scope_; auto& atomic_var_ref = async_work_queue_->AtomicVarRef(); for (auto var_id : instr.GCCheckVars()) { VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " " << var_scope.VarDesc(var_id); bool is_ready = atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1; // ignore all persistable var while GC if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) { continue; } if (is_ready) { VLOG(6) << "Async delete variable with name : " << var_scope.GetNameById(var_id); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (IsInterpretercoreFastGCEnabled()) { static_cast(gc_.get())->Add( var_scope.Var(var_id)); } else { static_cast(gc_.get())->Add( var_scope.Var(var_id), gc_event_.at(instr_id), &instr.DeviceContext()); } #else static_cast(gc_.get())->Add( var_scope.Var(var_id), gc_event_.at(instr_id), &instr.DeviceContext()); #endif } } } void InterpreterCore::Prepare( const std::vector& feed_names, const std::vector& feed_tensors, bool prepare_feed) { PADDLE_ENFORCE_EQ(feed_names.size(), feed_tensors.size(), platform::errors::PreconditionNotMet( "Required feed_names.size() == feed_tensors.size(), " "but received %d != %d", feed_names.size(), feed_tensors.size())); auto FeedInput = [&] { VLOG(4) << "Feed inputs"; for (size_t i = 0; i < feed_names.size(); ++i) { auto* feed_var = global_scope_->FindVar(feed_names[i]); PADDLE_ENFORCE_NOT_NULL( feed_var, platform::errors::NotFound( "Variable %s should not be nullptr.", feed_names[i])); auto feed_tensor = feed_var->GetMutable(); feed_tensor->ShareDataWith(feed_tensors[i]); feed_tensor->set_lod(feed_tensors[i].lod()); } }; if (!is_build_) { paddle::framework::interpreter::build_variable_scope(block_, global_scope_, create_local_scope_); FeedInput(); std::vector op_func_nodes; paddle::framework::interpreter::build_op_func_list( place_, block_, &op_func_nodes, global_scope_, create_local_scope_); is_build_ = true; SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); } // NOTE: Because feed_tensor will be GC after // paddle::framework::build_op_func_list, so we should // call FeedInput again. if (prepare_feed) { FeedInput(); } } interpreter::CostInfo InterpreterCore::DryRun( const std::vector& feed_names, const std::vector& feed_tensors) { global_scope_->SetLocalScope(local_scope_); Prepare(feed_names, feed_tensors, true); interpreter::CostInfo cost_info; { interpreter::ProfilerGuard(place_, &cost_info); ExecuteInstructionList(vec_instruction_); platform::DeviceContextPool::Instance().Get(place_)->Wait(); } if (create_local_scope_) { ClearLoDTensorArrayInLocalScope(); } return cost_info; } void InterpreterCore::SetFeedVarsInplaceSkip( const std::vector& feed_names) { for (auto& feed_name : feed_names) { global_scope_->SetVarSikpInplace(feed_name, true); } } } // namespace framework } // namespace paddle