// interpretercore.cc
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                            false,
                            "Use inplace in new executor");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
                            "Use local_scope in new executor (especially used "
                            "in UT), can turn off for better performance");

DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);

constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";

namespace paddle {
namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;

InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 framework::Scope* scope,
                                 bool used_for_jit)
    : place_(place),
      block_(block),
      skip_gc_vars_(skip_gc_vars),
      var_scope_(scope),
      stream_analyzer_(place),
      used_for_jit_(used_for_jit) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  is_build_ = false;

  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  create_local_scope_ = FLAGS_new_executor_use_local_scope;

  if (used_for_jit_) {
    create_local_scope_ = false;
  }
  VLOG(4) << "create_local_scope_ is " << create_local_scope_;

  if (create_local_scope_) {
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
  var_scope_.SetLocalScope(local_scope_);

  // prune

  // optimize graph pass

  // convert to run graph
}

InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);
  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear the mkl-dnn cache; this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

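// DryRun: prepare the program with the given feeds and execute it once under a
// profiler guard so that cost information can be collected; no fetch results
// are returned.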
interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    // RAII guard must be a named object so that profiling covers this scope.
    interpreter::ProfilerGuard profiler_guard(place_, &cost_info);

    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

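// Run with explicit feed tensors. On the first call the variable scope and
// instruction list are built inside Prepare(); on later calls the cached
// instruction list is executed asynchronously. Fetch results, if present, are
// moved out of kFetchVarName.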
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif

#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif

  bool is_build = is_build_;
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }
  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

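// Run without external feed tensors: feed variables are expected to already
// exist in the scope. Builds the program on the first call, otherwise executes
// the cached instruction list.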
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif

#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif

  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, &var_scope_, create_local_scope_);

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_,
                                                       used_for_jit_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  } else {
    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
  // return Fetch Tensors
  Scope* inner_scope =
      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
  auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

void InterpreterCore::SetSkipGcVars(const std::set<std::string>& skip_gc_vars) {
  PADDLE_ENFORCE_EQ(
      skip_gc_vars_.empty(),
      true,
      platform::errors::PreconditionNotMet(
          "Skip_gc_vars_ can only be initialized once, now skip_gc_vars_ is "
          "not empty, do not call SetSkipGcVars method repeatedly."));
  skip_gc_vars_ = skip_gc_vars;
}

const VariableScope* InterpreterCore::GetVariableScope() const {
  return &var_scope_;
}

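// Rebind the cached Variable pointers and per-instruction contexts after the
// underlying scope has been replaced.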
void InterpreterCore::reset_scope(Scope* new_scope) {
  var_scope_.SetScope(new_scope);
  auto& var_list = var_scope_.MutableVarList();
  for (size_t i = 0; i < var_list.size(); i++) {
    var_list[i] = new_scope->FindVar(var_scope_.GetNameById(i));
  }
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }
}

void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << &src
          << ") to InterpreterCore(" << this << ")";
}

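// A variable is a safe in-place candidate only if exactly one instruction
// needs it as an input buffer.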
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
  if (!var_scope_.VarDesc(var_index)) {
    return input_var2op_info_.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op_info_.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

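// Lazily create the shared AsyncWorkQueue (host and device thread pools) on
// first use.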
std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
  }
  return async_work_queue_;
}

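// Resolve an instruction's input/output variable ids to Variable* in the
// current scope and cache them in its runtime and infershape contexts.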
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  Scope* inner_scope =
      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                        // kernel
    Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                             : var_scope_.GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

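// Register input->output buffer-sharing (in-place) pairs for ops whose kernels
// provide an inplace inference function, skipping persistable variables,
// variables marked skip-inplace, and coalesce_tensor-related outputs.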
void InterpreterCore::BuildInplace() {
  // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput Tensor and a list
  // of Output Tensors which are sliced from the FusedOutput. These outputs
  // should not be the outvar of the in-place var-pair since memory reuse
  // between FusedOutput and Output Tensors is assumed. For the following
  // example:
  // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3)
  // var1 = sum(var4, var5)
  // ...
  //
  // After running coalesce_tensor_op, var1 is assumed to share the buffer
  // slices from fused_var. However, if sum_op is in-place, then var1 would
  // re-share the buffer with var4 instead of fused_var.
  std::set<std::string> skip_inplace_outvars;
  for (Instruction& instr : vec_instruction_) {
    OperatorBase* op = instr.OpBase();
    if (op->Type() == "coalesce_tensor") {
      const std::vector<std::string>& outputs =
          op->OutputVars(/*has_intermediate=*/false);
      skip_inplace_outvars.insert(outputs.begin(), outputs.end());
    }
  }

  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = var_scope_.VarDesc(iter->second[0]);
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (var_scope_.GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            const std::string& invar_name =
                var_scope_.GetNameById(iter->second[0]);
            const std::string& outvar_name =
                var_scope_.GetNameById(iterout->second[0]);
            auto invar = local_scope->FindVar(invar_name);
            auto outvar = local_scope->FindVar(outvar_name);

            if (invar && outvar && invar->IsType<LoDTensor>() &&
                outvar->IsType<LoDTensor>() &&
                skip_inplace_outvars.find(outvar_name) ==
                    skip_inplace_outvars.end()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << op_base->Type() << " " << invar_name
                      << " -> " << outvar_name;
            }
          }
        }
      }
    }
  }
}

void InterpreterCore::BuildOperatorDependences() {
  // Analyze the dependences between ops, set the dependecy_count_ and call
  // Schedule
  auto op_nums = vec_instruction_.size();
  dependecy_count_.resize(op_nums);
  auto op2downstream = dependency_builder_.Build(vec_instruction_);
  for (size_t op = 0; op < vec_instruction_.size(); ++op) {
    auto op_list = op2downstream[op];
    std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
    stream_analyzer_.Schedule(downsteam_vector, &vec_instruction_, op);

    for (auto inst_id : op_list) {
      dependecy_count_[inst_id]++;
    }
  }
}

// At the end of each step, the holder of Tensor in LoDTensorArray is null.
// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception
// will occur in the next step
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  auto vars = local_scope_->LocalVars();
  for (auto var : vars) {
    if (var->IsType<LoDTensorArray>()) {
      auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
      lod_tensor_arr->clear();
    }
  }
}

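// Convert the OpFuncNodes produced by build_op_func_list into the instruction
// list: bind device contexts, build operator dependences, compute the
// last-live-op set used for GC, cache instruction contexts, and optionally
// apply in-place optimization.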
void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
  auto var_nums = var_scope_.VarSize();
  input_var2op_info_.resize(var_nums);
  auto nodes = *op_func_nodes;

  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
  }
  BuildOperatorDependences();
  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    std::set<size_t> gc_check_inputs;
    for (auto& item : instr.Inputs()) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        input_var2op_info_.at(id).push_back(op_idx);
        // var can be gc-ed
        if (!info.IsBuilt()) {
          info.Build(instr.OpBase());
        }
        auto* var_desc = var_scope_.VarDesc(id);
        if (var_desc) {
          if (info.IsInArgBufferNeeded(var_desc->Name())) {
            gc_check_inputs.insert(id);
          }
        } else {
          gc_check_inputs.insert(id);
        }
      }
    }
    for (auto var_id : gc_check_inputs) {
      Scope* inner_scope =
          create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
      paddle::framework::Variable* var =
          inner_scope->FindVar(var_scope_.GetNameById(var_id));
      if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after "
                << instr.OpBase()->Type() << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    // check outputs
    for (auto& item : vec_instruction_[i].Outputs()) {
      for (auto var_id : item.second) {
        if (input_var2op_info_.at(var_id).size() == 0) {
          last_live_ops_[var_id].insert(i);
        }
      }
    }
  }
  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : skip_gc_vars_) {
    int var_id = var_scope_.GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink: find the downstream op that has no other op in the
  // downstream list happening before it
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minumum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any
      for (size_t other_item : last_live_ops_[i]) {
        if (dependency_builder_.OpHappensBefore(item, other_item)) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << var_scope_.GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minumum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minumum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  bool inplaced = false;
  for (auto inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  // prepare for the first time.
  std::promise<std::unique_ptr<AtomicVectorSizeT>> deps_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_deps_ = deps_promise.get_future();
  deps_promise.set_value(interpreter::PrepareAtomicDeps(dependecy_count_));

  std::promise<std::unique_ptr<AtomicVectorSizeT>> var_ref_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_var_ref_ = var_ref_promise.get_future();
  var_ref_promise.set_value(
      interpreter::PrepareAtomicVarRef(var_scope_.VecMetaInfo()));
}

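// Mark instructions whose inputs carry no LoD information so that their
// InferShape can skip sharing LoD.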
void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
      for (auto& var : input.second) {
        if (var->IsType<LoDTensor>()) {
          if (var->Get<LoDTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
  }
}

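// Execute a single instruction: run InferShape (for kernel ops), apply
// in-place buffer sharing if enabled, launch the phi or fluid kernel, and
// transfer in-place transformed variables back.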
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place)) {
    auto dev_id = place.device;
    platform::SetNPUDeviceId(dev_id);
    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
    // values, but only through special `float_status` to check whether
    // the operation overflows. More about `float_status`, see:
    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
    if (FLAGS_check_nan_inf) {
      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
    }
  }
#endif

  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape does nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
      infershape_event.End();
      platform::RecordOpInfoSupplement(op->Type(),
                                       op->Attrs(),
                                       *(instr_node.InnerInferShapeContext()),
                                       *(instr_node.InnerRuntimeContext()));
    }
  }
  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace?
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext phi_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &phi_kernel_context);

        (*instr_node.PhiKernel())(&phi_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back from "
              << var_scope_.GetNameById(p.first) << " to "
              << var_scope_.GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debug nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *local_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

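// Enqueue all instructions with zero dependency count on the async work queue,
// then block the main thread until every op has finished or an exception is
// caught.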
void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  unfinished_op_numer_ = vec_instr.size();
  if (unfinished_op_numer_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  platform::RecordEvent record_prepare(
      "PrepareAtomic", platform::TracerEventType::UserDefined, 1);
  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
  // those for the next step
  auto atomic_deps = atomic_deps_.get();
  auto atomic_var_ref = atomic_var_ref_.get();

  atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_);
  atomic_var_ref_ =
      async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo());
  record_prepare.End();

  exception_holder_.Clear();

  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this,
                                  i,
                                  atomic_deps = atomic_deps.get(),
                                  atomic_var_ref = atomic_var_ref.get()] {
                                   RunInstructionAsync(
                                       i, atomic_deps, atomic_var_ref);
                                 });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}

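// After an instruction finishes, decrement the dependency counters of its
// downstream instructions and either enqueue the ready ones on the work queue
// or keep one of them to run in the current thread.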
void InterpreterCore::RunNextInstructions(
    const Instruction& instr,
    std::queue<size_t>* reserved_next_ops,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);
  VLOG(4) << "atomic 1:" << atomic_deps;
  auto& next_instr = instr.NextInstructions();

  auto IsReady = [atomic_deps](size_t next_id) {
    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
            << ", remain deps: " << (*atomic_deps)[next_id];
    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  if (instr.KernelType() == OpFuncType::kQueueAsync) {
    // move all sync_ops into other threads
    for (auto next_id : next_instr.SyncRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref]() {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    // keep all async_ops running in current thread
    for (auto next_id : next_instr.DirectRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
  } else {
    // move async_ops into async_thread
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
                                                    next_instr.DirectRunIds());
    int64_t first_op = -1;
    for (auto next_id : direct_run_ops) {
      if (IsReady(next_id)) {
        // only keep one op running in current thread
        if (first_op == -1) {
          first_op = next_id;
          continue;
        }
        // move rest ops into other threads
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    if (first_op != -1) reserved_next_ops->push(first_op);
  }
}

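// Worker-thread entry point: run the given instruction, then keep draining the
// ready instructions that should stay on this thread, recording exceptions and
// notifying the main thread when all ops are done.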
void InterpreterCore::RunInstructionAsync(
    size_t instr_id,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  std::queue<size_t> ready_ops;
  ready_ops.push(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop();
    auto& instr_node = vec_instruction_.at(instr_id);
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kQueueSync
                    ? "kQueueSync"
                    : "kQueueAsync")
            << " runs on " << platform::GetCurrentThreadName();

    auto* op = instr_node.OpBase();
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);

    try {
      interpreter::WaitEvent(instr_node, place_);

      RunInstruction(instr_node);

      CheckGC(instr_node, atomic_var_ref);

      interpreter::RecordEvent(instr_node, place_);
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }

    VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_;
    if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) ==
                 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
  }
}

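// For async instructions under fast GC, record the instruction's stream on
// every GC-candidate GPU tensor so its memory is not released while the stream
// may still be using it.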
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kQueueAsync) {
    return;
  }
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);

  gpuStream_t stream =
      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
  auto TensorRecordStream = [&stream](Tensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Here should do something to make sure that the tensor
      // is not freed until the H2D copies are done. However, simply launching
      // a CUDA runtime callback to the H2D stream may lead to high performance
      // overhead. As all the cases we meet in H2D are copies from CPUPlace at
      // present, we just log a WARNING here. A better design is required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead to data inconsistency";
    } else {
      // memory copies involve CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction run in.
   * 3. The tensor is the instruction's input, cause we assume that instruction
   * will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function will directly return when the recorded
   * stream is equal to the owning stream, recording a stream same as which
   * initialized this tensor has less time overhead. Conversely, it may take
   * more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. Now the instruction has no idea of which vars involve async running in the
   * OP function, and thus we can not recognize condition 4. It should be
   * supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " "
            << var_scope_.VarDesc(var_id);

    // persistable vars will be ignored during GC
    if (var_scope_.VarDesc(var_id) &&
        var_scope_.VarDesc(var_id)->Persistable()) {
      continue;
    }

    paddle::framework::Variable* var = var_scope_.VarRef(var_id);
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<LoDTensor>()) {
      TensorRecordStream(*(var->GetMutable<LoDTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
    } else if (var->IsType<phi::SelectedRows>()) {
      TensorRecordStream(
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
}
#endif

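// Decrement the reference count of each variable the instruction may free and
// hand fully released, non-persistable variables to the garbage collector.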
void InterpreterCore::CheckGC(
    const Instruction& instr,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  RecordStreamForGC(instr);
#endif
  auto& var_scope = var_scope_;

  for (auto var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC " << var_scope_.GetNameById(var_id) << " "
            << var_scope.VarDesc(var_id);
    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
            << " " << var_id;
    bool is_ready =
        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
    // ignore all persistable vars during GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
      gc_->Add(var_scope_.VarRef(var_id), instr);
    }
  }
}

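// Share the feed tensors into the feed variables and, on the first call, build
// the variable scope and instruction list for the block.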
void InterpreterCore::Prepare(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors,
    bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
                        feed_names.size(),
                        feed_tensors.size()));
  auto FeedInput = [&] {
    VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = local_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));

      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
      feed_tensor->ShareDataWith(feed_tensors[i]);
      feed_tensor->set_lod(feed_tensors[i].lod());
    }
  };

  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, &var_scope_, create_local_scope_);
    FeedInput();
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_,
                                                       used_for_jit_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  }
  // NOTE: Because feed_tensor will be GC'ed after
  // paddle::framework::build_op_func_list, we should call FeedInput again.
  if (prepare_feed) {
    FeedInput();
  }
}

void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
    var_scope_.SetVarSikpInplace(feed_name, true);
  }
}

std::shared_ptr<InterpreterCore> CreateInterpreterCore(
    const platform::Place& place,
    const ProgramDesc& prog,
    Scope* scope,
    const std::vector<std::string>& fetch_names,
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::add_fetch(fetch_names, block);

  core = std::make_shared<InterpreterCore>(place, *block, skip_gc_vars, scope);
  core->SetCopyProgram(new_prog);
  return core;
}

}  // namespace framework
}  // namespace paddle