interpretercore.cc
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

#include <unordered_set>

#include "gflags/gflags.h"

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_context.h"

#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

#include "paddle/phi/backends/device_manager.h"

// The difference between "sequential_run" and "serial_run":
// "sequential_run" dispatches OPs one by one according to the sequence in the
// Program, while "serial_run" ensures that all Ops are scheduled in a single
// thread. In standalone executor, "sequential_run" is also "serial_run", while
// "serial_run" is not necessarily "sequential_run".
PADDLE_DEFINE_EXPORTED_bool(new_executor_sequential_run,
                            false,
                            "Enable sequential execution for standalone "
                            "executor, only applied to GPU OPs.");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                            false,
                            "Use inplace in new executor");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
                            "Use local_scope in new executor(especially used "
                            "in UT), can turn off for better performance");
PADDLE_DEFINE_EXPORTED_bool(control_flow_use_new_executor,
                            false,
                            "Use new executor in control flow op");

DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);

constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";

namespace paddle {
namespace framework {

inline void SetDeviceId(const platform::Place& place) {
  // TODO(zhiqiu): reduce the cost
  if (platform::is_gpu_place(place)) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
    PADDLE_THROW(platform::errors::Unavailable(
        "Cannot run operator on place %s, please recompile paddle or "
        "reinstall Paddle with CUDA support.",
        place));
#else
    auto dev_id = place.device;
    platform::SetDeviceId(dev_id);
#endif
  } else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
    PADDLE_THROW(platform::errors::Unavailable(
        "Cannot run operator on place %s, please recompile paddle or "
        "reinstall Paddle with XPU support.",
        place));
#else
    auto dev_id = place.device;
    platform::SetXPUDeviceId(dev_id);
#endif
  } else if (platform::is_npu_place(place)) {
#ifndef PADDLE_WITH_ASCEND_CL
    PADDLE_THROW(platform::errors::Unavailable(
        "Cannot run operator on place %s, please recompile paddle or "
        "reinstall Paddle with NPU support.",
        place));
#else
    auto dev_id = place.device;
    platform::SetNPUDeviceId(dev_id);
#endif
  } else if (platform::is_custom_place(place)) {
#ifndef PADDLE_WITH_CUSTOM_DEVICE
    PADDLE_THROW(platform::errors::Unavailable(
        "Cannot run operator on place %s, please recompile paddle or "
        "reinstall Paddle with CustomDevice support.",
        place));
#else
    phi::DeviceManager::SetDevice(place);
#endif
  }
}

// TODO(Ruibiao): Pass skip_gc_vars, used_for_jit, and other config messages by
// constructing an interpreter::ExecutionConfig
InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 framework::Scope* scope,
                                 bool used_for_jit,
                                 bool used_for_control_flow_op)
    : place_(place),
      block_(block),
      execution_config_(place, block.OpSize()),
      stream_analyzer_(place),
      var_scope_(scope) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  execution_config_.used_for_jit = used_for_jit;
  execution_config_.used_for_control_flow_op = used_for_control_flow_op;
  execution_config_.create_local_scope = !used_for_jit &&
                                         FLAGS_new_executor_use_local_scope &&
                                         !used_for_control_flow_op;
  execution_config_.skip_gc_vars = skip_gc_vars;
  execution_config_.Log(/*log_level=*/8);

  if (execution_config_.create_local_scope) {
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
  var_scope_.SetLocalScope(local_scope_);
}

InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);
  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear mkl-dnn cache,
  // this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<phi::DenseTensor>& feed_tensors) {
  SetDeviceId(place_);

  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    interpreter::ProfilerGuard(place_, &cost_info);

    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program runs only
    // once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (HasLocalScope()) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<phi::DenseTensor>& feed_tensors) {
  SetDeviceId(place_);

#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif

  bool is_build = is_build_;
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program runs only
    // once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    if (platform::is_npu_place(place_)) {
      platform::DeviceContextPool::Instance().Get(place_)->Wait();
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    if (platform::is_custom_place(place_)) {
      platform::DeviceContextPool::Instance().Get(place_)->Wait();
    }
#endif
  }

  if (HasLocalScope()) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names, bool need_fetch) {
  SetDeviceId(place_);

#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif

  if (!is_build_) {
    LOG_FIRST_N(INFO, 1) << "New Executor is Running.";
    paddle::framework::interpreter::BuildVariableScope(
        block_, &var_scope_, HasLocalScope());

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::BuildOpFuncList(
        place_,
        block_,
        execution_config_.skip_gc_vars,
        &op_func_nodes,
        &var_scope_,
        execution_config_,
        HasLocalScope());
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
    is_build_ = true;
  } else {
    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program runs only
    // once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    if (platform::is_npu_place(place_)) {
      platform::DeviceContextPool::Instance().Get(place_)->Wait();
    }
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
    if (platform::is_custom_place(place_)) {
      platform::DeviceContextPool::Instance().Get(place_)->Wait();
    }
#endif
  }

  if (HasLocalScope()) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  Scope* inner_scope =
      HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
  auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
  if (fetch_var && need_fetch) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

void InterpreterCore::SetSkipGcVars(const std::set<std::string>& skip_gc_vars) {
  PADDLE_ENFORCE_EQ(
      execution_config_.skip_gc_vars.empty(),
      true,
      platform::errors::PreconditionNotMet(
          "execution_config_.skip_gc_vars can only be initialized once, now "
          "execution_config_.skip_gc_vars is "
          "not empty, do not call SetSkipGcVars method repeatedly."));
  execution_config_.skip_gc_vars = skip_gc_vars;
}

const VariableScope* InterpreterCore::GetVariableScope() const {
  return &var_scope_;
}

void InterpreterCore::reset_scope(Scope* new_scope) {
  var_scope_.SetScope(new_scope);
  auto& var_list = var_scope_.MutableVarList();
  for (size_t i = 0; i < var_list.size(); i++) {
    const auto& var_name = var_scope_.GetNameById(i);
    var_list[i] = new_scope->FindVar(var_name);
  }
  // The index should be assured valid, because the InterpreterCore may not be
  // fully built but still be cached and used. For example, see unit test
  // `test_assert.py`: it may exit before `InterpreterCore::Convert`, but the
  // core is still cached and used by later tests.
  for (size_t i = 0; i < std::min(refs_.size(), var_list.size()); i++) {
    refs_[i]->ResetVariable(var_list[i]);
  }

  for (size_t i = 0; i < vec_instruction_.size(); i++) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }
}

void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src.get()
          << ") to InterpreterCore(" << this << ")";
}

bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(
    const std::vector<std::vector<size_t>>& input_var2op, size_t var_index) {
  if (!var_scope_.VarDesc(var_index)) {
    return input_var2op.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        execution_config_.host_num_threads,
        execution_config_.deivce_num_threads,
        &main_thread_blocker_);
  }
  return async_work_queue_;
}

void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  Scope* inner_scope =
      HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                        // kernel
    Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope()
                                         : var_scope_.GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

void InterpreterCore::BuildInplace() {
  // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput phi::DenseTensor
  // and a list of Output Tensors which are sliced from the FusedOutput. These
  // outputs should not be the outvar of the in-place var-pair since memory
  // reuse between FusedOutput and Output Tensors is assumed. For the following
  // example:
  // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3)
  // var1 = sum(var4, var5)
  // ...
  //
  // After running coalesce_tensor_op, var1 is assumed to share the buffer
  // slices from fused_var. However, if sum_op is in-place, then var1 would
  // re-share the buffer with var4 instead of fused_var.
  std::set<std::string> skip_inplace_outvars;
  for (Instruction& instr : vec_instruction_) {
    OperatorBase* op = instr.OpBase();
    if (op->Type() == kCoalesceTensor) {
      const std::vector<std::string>& outputs =
          op->OutputVars(/*has_intermediate=*/false);
      skip_inplace_outvars.insert(outputs.begin(), outputs.end());
    }
  }

  Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope()
                                       : var_scope_.GetMutableScope();
  std::vector<std::vector<size_t>> input_var2op(var_scope_.VarSize());
  for (Instruction& instr : vec_instruction_) {
    for (auto& item : instr.Inputs()) {
      for (int var_id : item.second) {
        if (var_id != kEmptyVarIndex) {
          input_var2op.at(var_id).push_back(instr.Id());
        }
      }
    }
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = var_scope_.VarDesc(iter->second[0]);
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (var_scope_.GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(input_var2op, iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            const std::string& invar_name =
                var_scope_.GetNameById(iter->second[0]);
            const std::string& outvar_name =
                var_scope_.GetNameById(iterout->second[0]);
            auto invar = local_scope->FindVar(invar_name);
            auto outvar = local_scope->FindVar(outvar_name);

            if (invar && outvar && invar->IsType<phi::DenseTensor>() &&
                outvar->IsType<phi::DenseTensor>() &&
                skip_inplace_outvars.find(outvar_name) ==
                    skip_inplace_outvars.end()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << op_base->Type() << " " << invar_name
                      << " -> " << outvar_name;
            }
          }
        }
      }
    }
  }
}

void InterpreterCore::BuildOperatorDependences() {
  // analyze the dependencies between ops, add next_instr_list to each instr,
  // and set the dependecy_count_
  size_t instr_num = vec_instruction_.size();
  dependecy_count_.resize(instr_num);
  auto downstream_map = dependency_builder_.Build(
      vec_instruction_,
      /*is_sequential_run=*/FLAGS_new_executor_sequential_run);

  for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) {
    Instruction& cur_instr = vec_instruction_[instr_id];
    const std::set<size_t>& next_instr_ids = downstream_map[instr_id];

    if (cur_instr.KernelType() == OpFuncType::kGpuAsync) {
      for (size_t next_instr_id : next_instr_ids) {
        if (vec_instruction_[next_instr_id].KernelType() ==
            OpFuncType::kGpuAsync) {
          cur_instr.AddNextInstrInSameThread(next_instr_id);
        } else {
          cur_instr.AddNextInstrInDifferentThread(next_instr_id);
        }
      }
    } else {
      bool has_instr_in_same_thread = false;
      for (size_t next_instr_id : next_instr_ids) {
        if (!has_instr_in_same_thread &&
            vec_instruction_[next_instr_id].KernelType() !=
                OpFuncType::kGpuAsync) {
          cur_instr.AddNextInstrInSameThread(next_instr_id);
          has_instr_in_same_thread = true;
        } else {
          cur_instr.AddNextInstrInDifferentThread(next_instr_id);
        }
      }
    }

    for (size_t next_instr_id : next_instr_ids) {
      ++dependecy_count_[next_instr_id];
    }
  }
}

// At the end of each step, the holder of phi::DenseTensor in LoDTensorArray is
// null. Clear these Tensors and leave LoDTensorArray empty, otherwise an
// exception will occur in the next step
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  auto vars = local_scope_->LocalVars();
  for (auto var : vars) {
    if (var->IsType<LoDTensorArray>()) {
      auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
      lod_tensor_arr->clear();
    }
  }
}

void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
  auto nodes = *op_func_nodes;
  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    Priority priority =
        interpreter::IsCommunicationOp(op_func_node.operator_base_->Type())
            ? Priority::kLowest
            : Priority::kNormal;
    vec_instruction_.emplace_back(
        op_idx, std::move(op_func_node), *dev_ctx_, priority);
  }

  BuildOperatorDependences();

  stream_analyzer_.ConstructEvents(dependency_builder_, &vec_instruction_);

  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    Instruction& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    info.Build(instr.OpBase());

    std::set<size_t> gc_check_vars;

    const std::map<std::string, std::vector<int>>& ins = instr.Inputs();
    const std::map<std::string, std::vector<int>>& outs = instr.Outputs();
    std::multimap<std::string, std::vector<int>> ins_and_outs{ins.begin(),
                                                              ins.end()};
    ins_and_outs.insert(outs.begin(), outs.end());

    for (auto& item : ins_and_outs) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        auto* var_desc = var_scope_.VarDesc(id);
        // skip no_need_buffer input vars
        if (var_desc && ins.count(item.first) &&
            !info.IsInArgBufferNeeded(var_desc->Name())) {
          continue;
        } else if (!block_.HasVar(var_scope_.GetNameById(id))) {
          VLOG(10) << "[gc_check_inputs] skip gc: "
                   << var_scope_.GetNameById(id);
          continue;
        }
        gc_check_vars.insert(id);
      }
    }

    for (auto var_id : gc_check_vars) {
      Scope* inner_scope =
          HasLocalScope() ? local_scope_ : var_scope_.GetMutableScope();
      paddle::framework::Variable* var =
          inner_scope->FindVar(var_scope_.GetNameById(var_id));
      if (var->IsType<phi::DenseTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after "
                << instr.OpBase()->Type() << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }

  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : execution_config_.skip_gc_vars) {
    int var_id = var_scope_.GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink: find the downstream ops that have no other op in the
  // downstream list happening before them.
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minumum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any other
      for (size_t other_item : last_live_ops_[i]) {
        if (dependency_builder_.OpHappensBefore(item, other_item)) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << var_scope_.GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minumum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minumum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  bool inplaced = false;
  for (const Instruction& inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  for (auto& dep : dependecy_count_) {
    deps_.emplace_back(std::make_shared<interpreter::OpDepInfo>(dep));
  }
  for (size_t i = 0; i < vec_meta_info.size(); ++i) {
    refs_.emplace_back(std::make_shared<interpreter::VarRefInfo>(
        vec_meta_info[i].var_ref_count_, var_scope_.VarRef(i)));
  }
}

void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
      for (auto& var : input.second) {
        if (var->IsType<phi::DenseTensor>()) {
          if (var->Get<phi::DenseTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
  }
}

void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  Scope* local_scope = HasLocalScope() ? var_scope_.GetMutableLocalScope()
                                       : var_scope_.GetMutableScope();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);

  SetDeviceId(place);

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place)) {
    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the
    // variable values, but only through a special `float_status` that checks
    // whether the operation overflows. More about `float_status`, see:
    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
    if (FLAGS_check_nan_inf) {
      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
    }
  }
#endif

  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape does nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
      infershape_event.End();
      platform::RecordOpInfoSupplement(op->Type(),
                                       op->Attrs(),
                                       *(instr_node.InnerInferShapeContext()),
                                       *(instr_node.InnerRuntimeContext()),
                                       op->Id());
    }
  }
  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace?
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext phi_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &phi_kernel_context);

        (*instr_node.PhiKernel())(&phi_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back from "
              << var_scope_.GetNameById(p.first) << " to "
              << var_scope_.GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debug nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *local_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  interpreter::ResetAtomicGuard guard(&deps_, &refs_);
  unfinished_op_number_ = vec_instr.size();
  if (unfinished_op_number_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  exception_holder_.Clear();

  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this, i] { RunInstructionAsync(i); });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}

void InterpreterCore::RunNextInstructions(
    const Instruction& instr, std::deque<size_t>* reserved_next_ops) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);

  auto IsReady = [this](size_t next_id) {
    VLOG(4) << "op_id: " << next_id
            << ", remain deps: " << deps_[next_id]->DynamicDep();
    return deps_[next_id]->CheckAndDecrease();
  };

  for (size_t next_instr_id : instr.NextInstrsInDifferenceThread()) {
    if (IsReady(next_instr_id)) {
      async_work_queue_->AddTask(
          vec_instruction_[next_instr_id].KernelType(),
          [this, next_instr_id]() { RunInstructionAsync(next_instr_id); });
    }
  }

  for (size_t next_instr_id : instr.NextInstrsInSameThread()) {
    if (IsReady(next_instr_id)) {
      if (vec_instruction_[next_instr_id].GetPriority() == Priority::kLowest) {
        reserved_next_ops->push_back(next_instr_id);
      } else {
        reserved_next_ops->push_front(next_instr_id);
      }
    }
  }
}
void InterpreterCore::RunInstructionAsync(size_t instr_id) {
  std::deque<size_t> ready_ops;
  ready_ops.push_back(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop_front();
    auto& instr_node = vec_instruction_.at(instr_id);
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kCpuSync
                    ? "kCpuSync"
                    : (instr_node.KernelType() == OpFuncType::kGpuSync
                           ? "kGpuSync"
                           : "kGpuAsync"))
            << " runs on " << platform::GetCurrentThreadName();

    auto* op = instr_node.OpBase();
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);

    try {
      instr_node.WaitEvent(place_);

      if (!instr_node.IsArtificial()) {
        RunInstruction(instr_node);
        CheckGC(instr_node);
        interpreter::LogDeviceMemoryStats(place_);
      }

      instr_node.RecordEvent(place_);
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }

    VLOG(4) << "unfinished_op_number_: " << unfinished_op_number_;
    if (UNLIKELY(unfinished_op_number_.fetch_sub(
                     1, std::memory_order_relaxed) == 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

    RunNextInstructions(instr_node, &ready_ops);
  }
}

void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP)
  PADDLE_THROW(platform::errors::Unimplemented(
      "RecordStreamForGC is only implemented when compiled with GPU."));
#else
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kGpuAsync) {
    return;
  }
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);

  gpuStream_t stream =
      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
  auto TensorRecordStream = [&stream](phi::DenseTensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Something should be done here to make sure that the
      // tensor is not freed until the H2D copies are done. However, simply
      // launching a CUDA runtime callback to the H2D stream may lead to high
      // performance overhead. As all the cases we meet in H2D are copies from
      // CPUPlace at present, we just log a WARNING here. A better design is
      // required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead to data inconsistency";
    } else {
      // memory copies involving CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao): Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction runs in.
   * 3. The tensor is the instruction's input, because we assume that an
   * instruction will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function will directly return when the recorded
   * stream is equal to the owning stream, recording the same stream as the one
   * that initialized this tensor has less time overhead. Conversely, it may
   * take more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. Now the instruction has no idea of which vars are involved in async
   * running in the OP function, and thus we can not recognize condition 4. It
   * should be supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " "
            << var_scope_.VarDesc(var_id);

    // persistable vars will be ignored while GC
    if (var_scope_.VarDesc(var_id) &&
        var_scope_.VarDesc(var_id)->Persistable()) {
      continue;
    }

    paddle::framework::Variable* var = var_scope_.VarRef(var_id);
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<phi::DenseTensor>()) {
      TensorRecordStream(*(var->GetMutable<phi::DenseTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
    } else if (var->IsType<phi::SelectedRows>()) {
      TensorRecordStream(
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
#endif
}

void InterpreterCore::CheckGC(const Instruction& instr) {
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  RecordStreamForGC(instr);
#endif
  auto& var_scope = var_scope_;

  for (auto var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC:" << var_scope_.GetNameById(var_id) << ", id:" << var_id
            << ", ref:" << refs_[var_id]->DynamicRef();
    bool is_ready = refs_[var_id]->CheckAndDecrease();
    // ignore all persistable var while GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
      gc_->Add(refs_[var_id]->Var(), instr);
    }
  }
}

void InterpreterCore::Prepare(const std::vector<std::string>& feed_names,
                              const std::vector<phi::DenseTensor>& feed_tensors,
                              bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
                        feed_names.size(),
                        feed_tensors.size()));
  auto FeedInput = [&] {
    VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = local_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));

      auto feed_tensor = feed_var->GetMutable<phi::DenseTensor>();
      feed_tensor->ShareDataWith(feed_tensors[i]);
      feed_tensor->set_lod(feed_tensors[i].lod());
    }
  };

  if (!is_build_) {
    paddle::framework::interpreter::BuildVariableScope(
        block_, &var_scope_, HasLocalScope());
    FeedInput();
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::BuildOpFuncList(
        place_,
        block_,
        execution_config_.skip_gc_vars,
        &op_func_nodes,
        &var_scope_,
        execution_config_,
        HasLocalScope());
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
    is_build_ = true;
  }
  // NOTE: Because feed_tensor will be GC'ed after
  // paddle::framework::BuildOpFuncList, we should
  // call FeedInput again.
  if (prepare_feed) {
    FeedInput();
  }
}

void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
    var_scope_.SetVarSikpInplace(feed_name, true);
  }
}

bool InterpreterCore::HasLocalScope() const { return local_scope_ != nullptr; }

std::shared_ptr<InterpreterCore> CreateInterpreterCore(
    const platform::Place& place,
    const ProgramDesc& prog,
    Scope* scope,
    const std::vector<std::string>& fetch_names,
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `AddFetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::AddFetch(fetch_names, block);

  core = std::make_shared<InterpreterCore>(place, *block, skip_gc_vars, scope);
  core->SetCopyProgram(new_prog);
  return core;
}
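
// A minimal usage sketch for the factory above (hypothetical caller code; the
// program/scope setup and the "loss" fetch name are illustrative assumptions,
// not taken from this file):
//
//   paddle::framework::ProgramDesc program = LoadProgramFromSomewhere();
//   paddle::framework::Scope scope;
//   auto core = paddle::framework::CreateInterpreterCore(
//       paddle::platform::CPUPlace(), program, &scope,
//       /*fetch_names=*/{"loss"}, /*skip_gc_vars=*/{});
//   auto fetch_list = core->Run(/*feed_names=*/{}, /*need_fetch=*/true);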

}  // namespace framework
}  // namespace paddle