interpretercore.cc 38.0 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14

15
#include "paddle/fluid/framework/new_executor/interpretercore.h"
16

17
#include <unordered_set>
18

19
#include "paddle/fluid/framework/details/nan_inf_utils.h"
20
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
21 22
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/operator.h"
L
liutiexing 已提交
23 24
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
25
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
26
#include "paddle/phi/common/place.h"
27
#include "paddle/phi/core/kernel_context.h"
L
Leo Chen 已提交
28 29 30
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
31
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
32

33
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
34
                            false,
35
                            "Use inplace in new executor");
36 37
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
38 39
                            "Use local_scope in new executor(especially used "
                            "in UT), can turn off for better performance");
40

41
DECLARE_bool(check_nan_inf);
42
DECLARE_bool(benchmark);
43

44
constexpr const char* kExceptionCaught = "ExceptionCaught";
45
constexpr const char* kTaskCompletion = "TaskCompletion";
46

47 48
namespace paddle {
namespace framework {
49 50
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
51
static constexpr size_t kDeviceNumThreads = 1;
52

53 54
// Constructs an interpreter bound to one program block on one device.
// `skip_gc_vars` lists variables that must never be garbage-collected
// between steps; `scope` is the user-provided outer scope that backs
// var_scope_.
InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 framework::Scope* scope)
    : place_(place),
      block_(block),
      skip_gc_vars_(skip_gc_vars),
      var_scope_(scope),
      stream_analyzer_(place) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  // The instruction list is built lazily on the first Run()/Prepare() call.
  is_build_ = false;

  // Two events used by worker threads to wake the blocked main thread:
  // one for an exception escaping an op, one for normal completion.
  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  // When enabled (FLAGS_new_executor_use_local_scope), ops execute in a
  // fresh child scope of the user scope instead of the user scope itself.
  create_local_scope_ = FLAGS_new_executor_use_local_scope;
  VLOG(4) << "create_local_scope_ is " << create_local_scope_;

  if (create_local_scope_) {
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
  // NOTE(review): when create_local_scope_ is false, local_scope_ keeps its
  // default value here — presumably the outer scope or nullptr; confirm the
  // member's initializer before relying on it.
  var_scope_.SetLocalScope(local_scope_);

  // prune

  // optimize graph pass

  // convert to run graph
}

85 86 87 88
// Tears down the interpreter: stops background GC, drops the (possibly
// shared) work queue, and flushes device-specific caches.
InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);

  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear mkl-dnn cache,
  // this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

99 100 101
// Executes the whole program once under a profiler and returns the measured
// cost info. Feeds are copied in via Prepare(); the device is drained before
// returning so the measurement covers all asynchronous work.
interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    // BUG FIX: the original statement `interpreter::ProfilerGuard(place_,
    // &cost_info);` constructed an UNNAMED temporary that was destroyed at
    // the end of that statement, so the RAII guard never covered the
    // execution below and cost_info was collected over an empty interval.
    // Naming the guard keeps it alive until the end of this scope.
    interpreter::ProfilerGuard profiler_guard(place_, &cost_info);

    // For the program that only run once, it is no need to
    // create work_queue, so the async_work_queue_ is created
    // until the second step run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc is the program only run once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
    // Drain the device so async kernels are included in the measurement.
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

W
wanghuancoder 已提交
133
// Runs the program with the given feed tensors and returns the fetch
// results (the contents of interpreter::kFetchVarName), or an empty list
// when the program has no fetch variable.
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  // Snapshot is_build_ BEFORE Prepare(): on the first call Prepare() builds
  // the instruction list, and the explicit ExecuteInstructionList below is
  // only taken on subsequent (already-built) calls. NOTE(review): this
  // implies the build path itself produces the step's results — confirm
  // against Prepare()/build_op_func_list.
  bool is_build = is_build_;
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // For the program that only run once, it is no need to
    // create work_queue, so the async_work_queue_ is created
    // until the second step run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc is the program only run once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }
  // Drop stale LoDTensorArray entries so the next step starts clean.
  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    // Move the fetch list out; the variable's storage is left moved-from.
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

176 177
// Runs the program without copying feed tensors (feeds are expected to be
// already present in the scope). First call builds the variable scope and
// instruction list; later calls execute the cached instructions.
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  if (!is_build_) {
    // First run: materialize variables for the block, then build (and,
    // per build_op_func_list semantics, execute) the op list once.
    paddle::framework::interpreter::build_variable_scope(block_, &var_scope_);

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_);
    is_build_ = true;
    // Feed vars must keep their buffers; mark them so inplace skips them.
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);

  } else {
    // For the program that only run once, it is no need to
    // create work_queue, so the async_work_queue_ is created
    // until the second step run.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc is the program only run once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

230 231 232 233 234 235 236 237 238 239 240
// Keeps a shared reference to the (copied) ProgramDesc so it outlives the
// interpreter's cached instructions that point into it.
void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

// Reuses the AsyncWorkQueue of another interpreter so both schedule onto the
// same thread pool (GetWorkQueue() lazily creates src's queue if needed).
void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  // BUG FIX: the original logged `&src` — the address of the local
  // shared_ptr parameter, not the source interpreter — which is meaningless
  // and inconsistent with logging `this` on the other side.
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << src.get()
          << ") to InterpreterCore(" << this << ")";
}

// Returns true when variable `var_index` is consumed by exactly one op
// (counting only consumers that actually need the variable's buffer),
// which makes its buffer safe to reuse in-place.
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
  if (!var_scope_.VarDesc(var_index)) {
    // No VarDesc available — fall back to the raw consumer count.
    return input_var2op_info_.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op_info_.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      // Only consumers that read the input's buffer contents count.
      if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

// Lazily creates the AsyncWorkQueue (host + device worker threads, wired to
// main_thread_blocker_) and returns it; shared so several interpreters can
// reuse one pool via ShareWorkQueueFrom().
std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
  }
  return async_work_queue_;
}

// Resolves every input/output variable id of the instruction to its
// Variable* in the local scope and caches the resulting runtime/infershape
// contexts on the instruction, so per-step execution does no scope lookups.
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(
          local_scope_->FindVar(var_scope_.GetNameById(id)));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(local_scope_->FindVar(var_scope_.GetNameById(id)));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                        // kernel
    Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                             : var_scope_.GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

// Registers in-place (input buffer reused as output) var pairs on each
// instruction, for ops whose OpInfo provides an infer_inplace_ rule.
// A pair is accepted only when: the input is not persistable, not marked
// skip-inplace (e.g. feed vars), is the op's only buffer-consumer, both
// sides are LoDTensors, and the output is not produced by coalesce_tensor.
void InterpreterCore::BuildInplace() {
  // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput Tensor and a list
  // of Output Tensors which are sliced from the FusedOutput. These outputs
  // sholud not be the outvar of the in-place var-pair since memory reuse
  // between FusedOutput and Output Tensors is assumed. For the following
  // example:
  // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3)
  // var1 = sum(var4, var5)
  // ...
  //
  // After running coalesce_tensor_op, var1 is assumed to share the buffer
  // slices from fused_var. However, if sum_op is in-place, then var1 would
  // re-share the buffer with var4 instead of fused_var.
  std::set<std::string> skip_inplace_outvars;
  for (Instruction& instr : vec_instruction_) {
    OperatorBase* op = instr.OpBase();
    if (op->Type() == "coalesce_tensor") {
      const std::vector<std::string>& outputs =
          op->OutputVars(/*has_intermediate=*/false);
      skip_inplace_outvars.insert(outputs.begin(), outputs.end());
    }
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    // The rule maps input slot name -> output slot name for this op.
    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = var_scope_.VarDesc(iter->second[0]);
        // Persistable vars (parameters) must keep their own buffers.
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (var_scope_.GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            const std::string& invar_name =
                var_scope_.GetNameById(iter->second[0]);
            const std::string& outvar_name =
                var_scope_.GetNameById(iterout->second[0]);
            auto invar = local_scope_->FindVar(invar_name);
            auto outvar = local_scope_->FindVar(outvar_name);

            if (invar && outvar && invar->IsType<LoDTensor>() &&
                outvar->IsType<LoDTensor>() &&
                skip_inplace_outvars.find(outvar_name) ==
                    skip_inplace_outvars.end()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << op_base->Type() << " " << invar_name
                      << " -> " << outvar_name;
            }
          }
        }
      }
    }
  }
}

X
xiongkun 已提交
369 370 371 372 373
void InterpreterCore::BuildOperatorDependences() {
  // analysis the dependences between ops, set the dependecy_count_ and Call
  // Schedule
  auto op_nums = vec_instruction_.size();
  dependecy_count_.resize(op_nums);
374
  auto op2downstream = dependency_builder_.Build(vec_instruction_);
X
xiongkun 已提交
375 376 377 378 379 380 381 382 383 384 385
  for (size_t op = 0; op < vec_instruction_.size(); ++op) {
    auto op_list = op2downstream[op];
    std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
    stream_analyzer_.Schedule(downsteam_vector, &vec_instruction_, op);

    for (auto inst_id : op_list) {
      dependecy_count_[inst_id]++;
    }
  }
}

386 387 388 389 390 391 392 393 394 395 396 397 398
// At the end of each step, the holder of Tensor in LoDTensorArray is null.
// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception
// will occur in the next step
// At the end of a step the Tensors inside every LoDTensorArray hold null
// buffers; empty the arrays so the next step does not trip over the stale
// entries.
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  for (auto* each_var : local_scope_->LocalVars()) {
    if (!each_var->IsType<LoDTensorArray>()) {
      continue;
    }
    each_var->GetMutable<LoDTensorArray>()->clear();
  }
}

L
Leo Chen 已提交
399 400
// Converts the built OpFuncNode list into the executable instruction graph:
// wraps each node in an Instruction bound to its device context, builds op
// dependencies, computes per-variable last-live-op sets for GC, caches
// execution contexts, and pre-publishes the first step's atomic counters.
void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
  auto var_nums = var_scope_.VarSize();
  input_var2op_info_.resize(var_nums);
  // Copy the node list; elements are moved out of the copy below, leaving
  // the caller's vector untouched.
  auto nodes = *op_func_nodes;

  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
  }

  BuildOperatorDependences();

  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    std::set<size_t> gc_check_inputs;

    for (auto& item : instr.Inputs()) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        input_var2op_info_.at(id).push_back(op_idx);
        // var can be gc-ed
        if (!info.IsBuilt()) {
          info.Build(instr.OpBase());
        }
        auto* var_desc = var_scope_.VarDesc(id);
        if (var_desc) {
          // Only inputs whose buffer the op actually reads are GC checked.
          if (info.IsInArgBufferNeeded(var_desc->Name())) {
            gc_check_inputs.insert(id);
          }
        } else {
          gc_check_inputs.insert(id);
        }
      }
    }

    for (auto var_id : gc_check_inputs) {
      paddle::framework::Variable* var =
          local_scope_->FindVar(var_scope_.GetNameById(var_id));
      // Only these tensor-like types participate in GC.
      if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after "
                << instr.OpBase()->Type() << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }

  // Outputs that nobody ever consumes die at their producing op.
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    // checkout output
    for (auto& item : vec_instruction_[i].Outputs()) {
      for (auto var_id : item.second) {
        if (input_var2op_info_.at(var_id).size() == 0) {
          last_live_ops_[var_id].insert(i);
        }
      }
    }
  }

  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : skip_gc_vars_) {
    int var_id = var_scope_.GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink, find the downstream op that has no other op in the
  // downstream list happens before it
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minimum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any
      for (size_t other_item : last_live_ops_[i]) {
        if (dependency_builder_.OpHappensBefore(item, other_item)) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << var_scope_.GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minimum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minimum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  // If the program was already inplaced by pass-inserted share ops, do not
  // apply the executor's own inplace pass on top of it.
  bool inplaced = false;
  // PERF FIX: iterate by reference — the original `for (auto inst : ...)`
  // copied every Instruction just to inspect its op type.
  for (auto& inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  // prepare for the first time: publish ready-made atomic counters through
  // the futures that ExecuteInstructionList consumes each step.
  std::promise<std::unique_ptr<AtomicVectorSizeT>> deps_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_deps_ = deps_promise.get_future();
  deps_promise.set_value(interpreter::PrepareAtomicDeps(dependecy_count_));

  std::promise<std::unique_ptr<AtomicVectorSizeT>> var_ref_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_var_ref_ = var_ref_promise.get_future();
  var_ref_promise.set_value(
      interpreter::PrepareAtomicVarRef(var_scope_.VecMetaInfo()));
}

541 542 543
void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
544
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
545 546 547 548 549 550 551 552 553 554 555 556
      for (auto& var : input.second) {
        if (var->IsType<LoDTensor>()) {
          if (var->Get<LoDTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
557
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
558 559 560
  }
}

561
// Executes a single instruction: InferShape (for kernel ops), optional
// in-place buffer sharing, the kernel itself (phi kernel, fluid kernel
// function, or OperatorBase::Run), then restores inplace-transferred vars
// and runs optional benchmark/nan-inf checks. Profiler events bracket the
// infershape and compute phases.
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place)) {
    auto dev_id = place.device;
    platform::SetNPUDeviceId(dev_id);
    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
    // values, but only through special `float_status` to checks whether
    // the operation is overflow. More about `float_status`, see:
    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
    if (FLAGS_check_nan_inf) {
      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
    }
  }
#endif

  // nullptr when the op is a plain OperatorBase (no kernel, e.g. control ops).
  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape do nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
      infershape_event.End();
      platform::RecordOpInfoSupplement(op->Type(),
                                       op->Attrs(),
                                       *(instr_node.InnerInferShapeContext()),
                                       *(instr_node.InnerRuntimeContext()));
    }
  }

  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace ?
    // Apply the pairs registered by BuildInplace(): output reuses the
    // input's buffer when shapes match.
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext phi_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &phi_kernel_context);

        (*instr_node.PhiKernel())(&phi_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back form "
              << var_scope_.GetNameById(p.first) << " to "
              << var_scope_.GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debug nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *local_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

// Schedules every zero-in-degree instruction onto the work queue, blocks the
// main thread until either all ops finish or an exception is caught, and
// rethrows any captured exception (cancelling remaining work for non-EOF
// errors). The atomic dep/ref-count vectors for the NEXT step are prepared
// asynchronously while this step runs.
void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  unfinished_op_numer_ = vec_instr.size();
  if (unfinished_op_numer_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  platform::RecordEvent record_prepare(
      "PrepareAtomic", platform::TracerEventType::UserDefined, 1);
  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
  // those for the next step
  auto atomic_deps = atomic_deps_.get();
  auto atomic_var_ref = atomic_var_ref_.get();

  atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_);
  atomic_var_ref_ =
      async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo());
  record_prepare.End();

  exception_holder_.Clear();

  // Seed the queue with ops that have no upstream dependencies. The tasks
  // capture RAW pointers into the local unique_ptrs; this is safe only
  // because WaitEvent() below does not return before all tasks complete —
  // NOTE(review): confirm WaitEvent's completion guarantee.
  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this,
                                  i,
                                  atomic_deps = atomic_deps.get(),
                                  atomic_var_ref = atomic_var_ref.get()] {
                                   RunInstructionAsync(
                                       i, atomic_deps, atomic_var_ref);
                                 });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}
741

L
liutiexing 已提交
742
// After `instr` finishes, decrements the atomic dependency counter of each
// downstream op and dispatches the ones that became ready: some are pushed
// to `reserved_next_ops` to run on the current thread, the rest are handed
// to the work queue. The split depends on whether the finished op was an
// async (device) or sync (host) op, to keep like-typed ops on their threads.
void InterpreterCore::RunNextInstructions(
    const Instruction& instr,
    std::queue<size_t>* reserved_next_ops,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);
  VLOG(4) << "atomic 1:" << atomic_deps;
  auto& next_instr = instr.NextInstructions();

  // Atomically consume one dependency; ready when this was the last one.
  auto IsReady = [atomic_deps](size_t next_id) {
    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
            << ", remain deps: " << (*atomic_deps)[next_id];
    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  if (instr.KernelType() == OpFuncType::kQueueAsync) {
    // move all sync_ops into other threads
    for (auto next_id : next_instr.SyncRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref]() {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    // keep all async_ops running in current thread
    for (auto next_id : next_instr.DirectRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
  } else {
    // move async_ops into async_thread
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
                                                    next_instr.DirectRunIds());
    // NOTE(review): 0 doubles as "no op chosen yet", so instruction id 0 can
    // never be the one kept on the current thread — it is always re-queued.
    // Presumably harmless (it is still executed), but worth confirming.
    size_t first_op = 0;
    for (auto next_id : direct_run_ops) {
      if (IsReady(next_id)) {
        // only keep one op running in current thread
        if (first_op == 0) {
          first_op = next_id;
          continue;
        }
        // move rest ops into other threads
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    if (first_op != 0) reserved_next_ops->push(first_op);
  }
}

813
void InterpreterCore::RunInstructionAsync(
814 815
    size_t instr_id,
    std::vector<std::atomic<size_t>>* atomic_deps,
816
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
L
liutiexing 已提交
817 818 819 820 821
  std::queue<size_t> ready_ops;
  ready_ops.push(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop();
822
    auto& instr_node = vec_instruction_.at(instr_id);
L
liutiexing 已提交
823
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
824 825 826 827
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kQueueSync
                    ? "kQueueSync"
                    : "kQueueAsync")
L
liutiexing 已提交
828 829
            << " runs on " << platform::GetCurrentThreadName();

830
    auto* op = instr_node.OpBase();
L
liutiexing 已提交
831 832
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);
833

834
    try {
835 836
      interpreter::WaitEvent(instr_node, place_);

837
      RunInstruction(instr_node);
838

839
      CheckGC(instr_node, atomic_var_ref);
840 841

      interpreter::RecordEvent(instr_node, place_);
842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }
864

865 866 867 868 869 870 871 872
    VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_;
    if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) ==
                 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

873
    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
L
liutiexing 已提交
874
  }
875 876
}

877 878 879 880 881 882
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kQueueAsync) {
    return;
  }
883 884
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);
885

L
Leo Chen 已提交
886 887
  gpuStream_t stream =
      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932
  auto TensorRecordStream = [&stream](Tensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Here should do something to make sure that the tensor is
      // not freed until the H2D copies done. However, simplely launch a CUDA
      // runtime callback to the H2D stream may lead a high performance
      // overhead. As all the cases we meet in H2D are copies from CPUPlace at
      // present, we just log a WARNING here. A better design is required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead a data inconsistent";
    } else {
      // memory copies involve CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction run in.
   * 3. The tensor is the instruction's input, cause we assume that instruction
   * will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function will directly return when the recored
   * stream is equal to the owning stream, recording a stream same as which
   * initialized this tensor has less time overhead. Conversely, it may take
   * more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. Now the instruction has no idea of which vars involving async running in
   * OP function, and thus we can not recognize condition 4. It should be
   * supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
933 934
    VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " "
            << var_scope_.VarDesc(var_id);
935 936

    // persistable var will be ignore while GC
937 938
    if (var_scope_.VarDesc(var_id) &&
        var_scope_.VarDesc(var_id)->Persistable()) {
939 940 941
      continue;
    }

942
    paddle::framework::Variable* var = var_scope_.VarRef(var_id);
943 944 945 946 947 948 949 950 951 952
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<LoDTensor>()) {
      TensorRecordStream(*(var->GetMutable<LoDTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
953
    } else if (var->IsType<phi::SelectedRows>()) {
954
      TensorRecordStream(
955
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
}
#endif

972 973 974
void InterpreterCore::CheckGC(
    const Instruction& instr,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
975 976
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
977 978 979
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  RecordStreamForGC(instr);
#endif
980
  auto& var_scope = var_scope_;
981

982
  for (auto var_id : instr.GCCheckVars()) {
983
    VLOG(4) << "GC " << var_scope_.GetNameById(var_id) << " "
L
Leo Chen 已提交
984
            << var_scope.VarDesc(var_id);
985 986
    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
            << " " << var_id;
987
    bool is_ready =
988
        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
989 990 991 992 993
    // ignore all persistable var while GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
X
xiongkun 已提交
994 995
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
996
      gc_->Add(var_scope_.VarRef(var_id), instr);
W
wanghuancoder 已提交
997 998 999 1000
    }
  }
}

1001 1002
void InterpreterCore::Prepare(
    const std::vector<std::string>& feed_names,
1003 1004 1005 1006
    const std::vector<framework::LoDTensor>& feed_tensors,
    bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
1007 1008 1009
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
1010 1011
                        feed_names.size(),
                        feed_tensors.size()));
1012

1013
  auto FeedInput = [&] {
1014
    VLOG(4) << "Feed inputs";
1015
    for (size_t i = 0; i < feed_names.size(); ++i) {
1016
      auto* feed_var = local_scope_->FindVar(feed_names[i]);
1017
      PADDLE_ENFORCE_NOT_NULL(
1018 1019 1020
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));
1021

1022
      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
1023
      feed_tensor->ShareDataWith(feed_tensors[i]);
1024
      feed_tensor->set_lod(feed_tensors[i].lod());
1025 1026 1027
    }
  };

1028
  if (!is_build_) {
1029
    paddle::framework::interpreter::build_variable_scope(
1030
        block_, &var_scope_, create_local_scope_);
1031
    FeedInput();
L
Leo Chen 已提交
1032
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
1033 1034 1035 1036
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
1037
                                                       &var_scope_,
1038
                                                       create_local_scope_);
1039
    is_build_ = true;
1040
    SetFeedVarsInplaceSkip(feed_names);
1041
    // convert vec func_list to graph
L
Leo Chen 已提交
1042
    Convert(&op_func_nodes);
1043
  }
W
wanghuancoder 已提交
1044 1045
  // NOTE: Because feed_tensor will be GC after
  // paddle::framework::build_op_func_list, so we should
1046
  // call FeedInput again.
1047 1048 1049
  if (prepare_feed) {
    FeedInput();
  }
1050 1051
}

1052 1053 1054
void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
1055
    var_scope_.SetVarSikpInplace(feed_name, true);
1056 1057 1058
  }
}

1059
std::shared_ptr<InterpreterCore> CreateInterpreterCore(
1060 1061
    const platform::Place& place,
    const ProgramDesc& prog,
1062
    Scope* scope,
1063
    const std::vector<std::string>& fetch_names,
1064 1065 1066 1067 1068 1069 1070 1071
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::add_fetch(fetch_names, block);

1072
  core = std::make_shared<InterpreterCore>(place, *block, skip_gc_vars, scope);
1073 1074 1075 1076
  core->SetCopyProgram(new_prog);
  return core;
}

}  // namespace framework
}  // namespace paddle