interpretercore.cc
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                            false,
                            "Use inplace in new executor");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
                            "Use local_scope in new executor(especially used "
                            "in UT), can turn off for better performance");

DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
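
// The two FLAGS_new_executor_* switches defined above are exported flags and
// are normally set from outside this translation unit, for example through
// the same-named environment variables or paddle.set_flags in Python,
// depending on how the embedding process parses flags.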

constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";

namespace paddle {
namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;

InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 framework::Scope* scope)
    : place_(place),
      block_(block),
      skip_gc_vars_(skip_gc_vars),
      var_scope_(scope),
      stream_analyzer_(place) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  is_build_ = false;

  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  create_local_scope_ = FLAGS_new_executor_use_local_scope;
  VLOG(4) << "create_local_scope_ is " << create_local_scope_;

  if (create_local_scope_) {
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
  var_scope_.SetLocalScope(local_scope_);

  // prune

  // optimize graph pass

  // convert to run graph
}

InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);

  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear mkl-dnn cache,
  // this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

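// Builds the program if necessary, executes it once under a ProfilerGuard,
// and returns the measured CostInfo instead of fetch results.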
interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    interpreter::ProfilerGuard profiler_guard(place_, &cost_info);

    // For a program that runs only once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the
    // second step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

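// Run() with explicit feed tensors: the tensors are bound to the feed
// variables in Prepare(); the first call builds the instruction list, and
// later calls execute it through the async work queue, returning the
// FetchList stored in kFetchVarName.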
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  bool is_build = is_build_;
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // For a program that runs only once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the
    // second step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }
  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(block_, &var_scope_);

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);

  } else {
    // For a program that runs only once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the
    // second step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << &src
          << ") to InterpreterCore(" << this << ")";
}

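// Returns true when the variable is consumed as an input buffer by exactly
// one instruction, which is a precondition for reusing it in-place.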
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
  if (!var_scope_.VarDesc(var_index)) {
    return input_var2op_info_.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op_info_.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
  }
  return async_work_queue_;
}

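// Looks up the instruction's input/output Variables in the (local) scope and
// caches them in its runtime and infershape contexts.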
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(
          local_scope_->FindVar(var_scope_.GetNameById(id)));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(local_scope_->FindVar(var_scope_.GetNameById(id)));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                        // kernel
    Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                             : var_scope_.GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

void InterpreterCore::BuildInplace() {
  // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput Tensor and a list
  // of Output Tensors which are sliced from the FusedOutput. These outputs
  // should not be the outvar of the in-place var-pair since memory reuse
  // between FusedOutput and Output Tensors is assumed. For the following
  // example:
  // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3)
  // var1 = sum(var4, var5)
  // ...
  //
  // After running coalesce_tensor_op, var1 is assumed to share the buffer
  // slices from fused_var. However, if sum_op is in-place, then var1 would
  // re-share the buffer with var4 instead of fused_var.
  std::set<std::string> skip_inplace_outvars;
  for (Instruction& instr : vec_instruction_) {
    OperatorBase* op = instr.OpBase();
    if (op->Type() == "coalesce_tensor") {
      const std::vector<std::string>& outputs =
          op->OutputVars(/*has_intermediate=*/false);
      skip_inplace_outvars.insert(outputs.begin(), outputs.end());
    }
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = var_scope_.VarDesc(iter->second[0]);
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (var_scope_.GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            const std::string& invar_name =
                var_scope_.GetNameById(iter->second[0]);
            const std::string& outvar_name =
                var_scope_.GetNameById(iterout->second[0]);
            auto invar = local_scope_->FindVar(invar_name);
            auto outvar = local_scope_->FindVar(outvar_name);

            if (invar && outvar && invar->IsType<LoDTensor>() &&
                outvar->IsType<LoDTensor>() &&
                skip_inplace_outvars.find(outvar_name) ==
                    skip_inplace_outvars.end()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << op_base->Type() << " " << invar_name
                      << " -> " << outvar_name;
            }
          }
        }
      }
    }
  }
}

void InterpreterCore::BuildOperatorDependences() {
  // analyze the dependencies between ops, set dependecy_count_ and call
  // Schedule
  auto op_nums = vec_instruction_.size();
  dependecy_count_.resize(op_nums);
  auto op2downstream = dependency_builder_.Build(vec_instruction_);
  for (size_t op = 0; op < vec_instruction_.size(); ++op) {
    auto op_list = op2downstream[op];
    std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
    stream_analyzer_.Schedule(downsteam_vector, &vec_instruction_, op);

    for (auto inst_id : op_list) {
      dependecy_count_[inst_id]++;
    }
  }
}
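
// Illustration with a hypothetical graph: if dependency_builder_ reports the
// downstream map {op0 -> {op1, op2}, op1 -> {op2}}, then dependecy_count_
// becomes {op0: 0, op1: 1, op2: 2} (the number of upstream ops each op waits
// for), and ExecuteInstructionList later seeds the work queue with the ops
// whose count is 0.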

// At the end of each step, the holder of Tensor in LoDTensorArray is null.
// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception
// will occur in the next step
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  auto vars = local_scope_->LocalVars();
  for (auto var : vars) {
    if (var->IsType<LoDTensorArray>()) {
      auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
      lod_tensor_arr->clear();
    }
  }
}

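// Converts the OpFuncNode list produced by build_op_func_list into
// vec_instruction_, then builds op dependencies, records the last op that
// uses each variable (for GC), caches per-instruction contexts, and applies
// in-place reuse when enabled.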
void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
  auto var_nums = var_scope_.VarSize();
  input_var2op_info_.resize(var_nums);
  auto nodes = *op_func_nodes;

  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
  }

  BuildOperatorDependences();

  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    std::set<size_t> gc_check_inputs;

    for (auto& item : instr.Inputs()) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        input_var2op_info_.at(id).push_back(op_idx);
        // var can be gc-ed
        if (!info.IsBuilt()) {
          info.Build(instr.OpBase());
        }
        auto* var_desc = var_scope_.VarDesc(id);
        if (var_desc) {
          if (info.IsInArgBufferNeeded(var_desc->Name())) {
            gc_check_inputs.insert(id);
          }
        } else {
          gc_check_inputs.insert(id);
        }
      }
    }

    for (auto var_id : gc_check_inputs) {
      paddle::framework::Variable* var =
          local_scope_->FindVar(var_scope_.GetNameById(var_id));
      if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after "
                << instr.OpBase()->Type() << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    // check output
    for (auto& item : vec_instruction_[i].Outputs()) {
      for (auto var_id : item.second) {
        if (input_var2op_info_.at(var_id).size() == 0) {
          last_live_ops_[var_id].insert(i);
        }
      }
    }
  }

  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : skip_gc_vars_) {
    int var_id = var_scope_.GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink: keep only the last-live ops that do not happen before any other
  // op in the list.
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minumum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any
      for (size_t other_item : last_live_ops_[i]) {
        if (dependency_builder_.OpHappensBefore(item, other_item)) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << var_scope_.GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minumum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minumum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  bool inplaced = false;
  for (auto inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  // prepare for the first time.
  std::promise<std::unique_ptr<AtomicVectorSizeT>> deps_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_deps_ = deps_promise.get_future();
  deps_promise.set_value(interpreter::PrepareAtomicDeps(dependecy_count_));

  std::promise<std::unique_ptr<AtomicVectorSizeT>> var_ref_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_var_ref_ = var_ref_promise.get_future();
  var_ref_promise.set_value(
      interpreter::PrepareAtomicVarRef(var_scope_.VecMetaInfo()));
}

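// If none of an instruction's LoDTensor inputs carries LoD information, mark
// its InferShapeContext so that LoD sharing can be skipped at run time.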
void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
      for (auto& var : input.second) {
        if (var->IsType<LoDTensor>()) {
          if (var->Get<LoDTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
  }
}

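// Runs one instruction: InferShape (for operators with kernels), optional
// in-place buffer sharing, then the phi or fluid kernel; afterwards it
// restores in-place-transferred variables and optionally checks for nan/inf.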
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_);
  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();

#ifdef PADDLE_WITH_ASCEND_CL
  // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
  // values, but only through special `float_status` to checks whether
  // the operation is overflow. More about `float_status`, see:
  // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
  if (FLAGS_check_nan_inf) {
    framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
  }
#endif

  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape does nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
      infershape_event.End();
      platform::RecordOpInfoSupplement(op->Type(),
                                       op->Attrs(),
                                       *(instr_node.InnerInferShapeContext()),
                                       *(instr_node.InnerRuntimeContext()));
    }
  }

  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace ?
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext phi_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &phi_kernel_context);

        (*instr_node.PhiKernel())(&phi_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back from "
              << var_scope_.GetNameById(p.first) << " to "
              << var_scope_.GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debugging nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *local_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

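// Seeds the async work queue with all instructions whose dependency count is
// zero, then blocks on main_thread_blocker_ until every op finishes or an
// exception is rethrown.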
void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  unfinished_op_numer_ = vec_instr.size();
  if (unfinished_op_numer_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  platform::RecordEvent record_prepare(
      "PrepareAtomic", platform::TracerEventType::UserDefined, 1);
  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
  // those for the next step
  auto atomic_deps = atomic_deps_.get();
  auto atomic_var_ref = atomic_var_ref_.get();

  atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_);
  atomic_var_ref_ =
      async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo());
  record_prepare.End();

  exception_holder_.Clear();

  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this,
                                  i,
                                  atomic_deps = atomic_deps.get(),
                                  atomic_var_ref = atomic_var_ref.get()] {
                                   RunInstructionAsync(
                                       i, atomic_deps, atomic_var_ref);
                                 });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}

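// After an instruction finishes, decrement the dependency count of its
// downstream ops; ops that become ready are either kept in the current
// thread (via reserved_next_ops) or dispatched to the async work queue,
// depending on the kernel types involved.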
void InterpreterCore::RunNextInstructions(
    const Instruction& instr,
    std::queue<size_t>* reserved_next_ops,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);
  VLOG(4) << "atomic 1:" << atomic_deps;
  auto& next_instr = instr.NextInstructions();

  auto IsReady = [atomic_deps](size_t next_id) {
    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
            << ", remain deps: " << (*atomic_deps)[next_id];
    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  if (instr.KernelType() == OpFuncType::kQueueAsync) {
    // move all sync_ops into other threads
    for (auto next_id : next_instr.SyncRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref]() {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    // keep all async_ops running in current thread
    for (auto next_id : next_instr.DirectRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
  } else {
    // move async_ops into async_thread
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
                                                    next_instr.DirectRunIds());
    size_t first_op = 0;
    for (auto next_id : direct_run_ops) {
      if (IsReady(next_id)) {
        // only keep one op running in current thread
        if (first_op == 0) {
          first_op = next_id;
          continue;
        }
        // move rest ops into other threads
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    if (first_op != 0) reserved_next_ops->push(first_op);
  }
}

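// Worker-thread entry point: runs the given instruction, then keeps draining
// ops that become ready for this thread; any exception is recorded in
// exception_holder_ and the blocked main thread is notified on completion or
// failure.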
void InterpreterCore::RunInstructionAsync(
    size_t instr_id,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  std::queue<size_t> ready_ops;
  ready_ops.push(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop();
    auto& instr_node = vec_instruction_.at(instr_id);
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kQueueSync
                    ? "kQueueSync"
                    : "kQueueAsync")
            << " runs on " << platform::GetCurrentThreadName();

    auto* op = instr_node.OpBase();
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);

    try {
      interpreter::WaitEvent(instr_node, place_);

      RunInstruction(instr_node);

      CheckGC(instr_node, atomic_var_ref);

      interpreter::RecordEvent(instr_node, place_);
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }

    VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_;
    if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) ==
                 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
  }
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kQueueAsync) {
    return;
  }
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);

  gpuStream_t stream =
      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
  auto TensorRecordStream = [&stream](Tensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Here we should do something to make sure that the
      // tensor is not freed until the H2D copies are done. However, simply
      // launching a CUDA runtime callback to the H2D stream may lead to high
      // performance overhead. As all the cases we meet in H2D are copies from
      // CPUPlace at present, we just log a WARNING here. A better design is
      // required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead to data inconsistency";
    } else {
      // memory copies involve CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction run in.
   * 3. The tensor is the instruction's input, cause we assume that instruction
   * will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function will directly return when the recorded
   * stream is equal to the owning stream, recording a stream same as which
   * initialized this tensor has less time overhead. Conversely, it may take
   * more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. Now the instruction has no idea of which vars involving async running in
   * OP function, and thus we can not recognize condition 4. It should be
   * supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " "
            << var_scope_.VarDesc(var_id);

    // persistable var will be ignored during GC
    if (var_scope_.VarDesc(var_id) &&
        var_scope_.VarDesc(var_id)->Persistable()) {
      continue;
    }

    paddle::framework::Variable* var = var_scope_.VarRef(var_id);
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<LoDTensor>()) {
      TensorRecordStream(*(var->GetMutable<LoDTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
    } else if (var->IsType<phi::SelectedRows>()) {
      TensorRecordStream(
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
}
#endif

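// Decrements the GC reference count of every variable the instruction may
// release and hands variables whose count reaches zero to the garbage
// collector; persistable variables are skipped.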
void InterpreterCore::CheckGC(
    const Instruction& instr,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  RecordStreamForGC(instr);
#endif
  auto& var_scope = var_scope_;

  for (auto var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC " << var_scope_.GetNameById(var_id) << " "
            << var_scope.VarDesc(var_id);
    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
            << " " << var_id;
    bool is_ready =
        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
    // ignore all persistable var while GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
      gc_->Add(var_scope_.VarRef(var_id), instr);
    }
  }
}

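// Binds the feed tensors into the scope and, on the first call, builds the
// variable scope and instruction list; FeedInput may run a second time
// because the feed tensors can be garbage-collected during
// build_op_func_list.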
void InterpreterCore::Prepare(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors,
    bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
                        feed_names.size(),
                        feed_tensors.size()));

  auto FeedInput = [&] {
    VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = local_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));

      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
      feed_tensor->ShareDataWith(feed_tensors[i]);
      feed_tensor->set_lod(feed_tensors[i].lod());
    }
  };

  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, &var_scope_, create_local_scope_);
    FeedInput();
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  }
  // NOTE: Because feed_tensor will be GC'ed after
  // paddle::framework::build_op_func_list, we should
  // call FeedInput again.
  if (prepare_feed) {
    FeedInput();
  }
}

void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
    var_scope_.SetVarSikpInplace(feed_name, true);
  }
}

std::shared_ptr<InterpreterCore> CreateInterpreterCore(
    const platform::Place& place,
    const ProgramDesc& prog,
    Scope* scope,
    const std::vector<std::string>& fetch_names,
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::add_fetch(fetch_names, block);

  core = std::make_shared<InterpreterCore>(place, *block, skip_gc_vars, scope);
  core->SetCopyProgram(new_prog);
  return core;
}

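// A minimal usage sketch (illustrative only; `prog`, `scope`, the variable
// names "x"/"out" and `x_tensor` are assumed, not taken from this file).
// Real callers such as the standalone executor follow the same pattern:
// build a core for one block, then call Run with feeds.
//
//   platform::Place place = platform::CPUPlace();
//   framework::Scope scope;
//   auto core = CreateInterpreterCore(place, prog, &scope,
//                                     /*fetch_names=*/{"out"},
//                                     /*skip_gc_vars=*/{});
//   std::vector<std::string> feed_names = {"x"};
//   std::vector<framework::LoDTensor> feed_tensors = {x_tensor};
//   paddle::framework::FetchList fetch_list =
//       core->Run(feed_names, feed_tensors);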
}  // namespace framework
}  // namespace paddle