// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/garbage_collector/event_garbage_collector.h"
#include "paddle/fluid/framework/new_executor/garbage_collector/fast_garbage_collector.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                            true,
                            "Use inplace in new executor");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
                            "Use local_scope in new executor(especially used "
                            "in UT), can turn off for better performance");

DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);
DECLARE_bool(fast_eager_deletion_mode);

constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";

namespace paddle {
namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;

bool IsInterpretercoreFastGCEnabled() {
  return memory::allocation::AllocatorFacade::Instance()
             .IsStreamSafeCUDAAllocatorUsed() &&
         FLAGS_fast_eager_deletion_mode;
}

InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 VariableScope* global_scope)
    : place_(place),
      block_(block),
      skip_gc_vars_(skip_gc_vars),
      global_scope_(global_scope),
      stream_analyzer_(place) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  is_build_ = false;

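  // Choose the GC strategy: fast GC is only available when the stream-safe
  // CUDA allocator is in use and FLAGS_fast_eager_deletion_mode is enabled;
  // otherwise fall back to the event-based garbage collector.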
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  if (IsInterpretercoreFastGCEnabled()) {
    gc_ = std::make_unique<InterpreterCoreFastGarbageCollector>();
  } else {
    gc_ = std::make_unique<InterpreterCoreEventGarbageCollector>();
  }
#else
  gc_ = std::make_unique<InterpreterCoreEventGarbageCollector>();
#endif

  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  create_local_scope_ = FLAGS_new_executor_use_local_scope;
  if (FLAGS_new_executor_use_local_scope) {
    auto local_scope = &global_scope->GetMutableScope()->NewScope();
    local_scope->AddListener(global_scope->Listener());
    local_scope_ = local_scope;
  }
  VLOG(4) << "create_local_scope_ is " << create_local_scope_;

  // prune

  // optimize graph pass

  // convert to run graph
}

InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);

  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear mkl-dnn cache,
  // this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

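// Build the program if needed and execute it once under a profiler guard,
// returning the collected CostInfo; used for warm-up and cost analysis rather
// than for fetching results.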
interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
  global_scope_->SetLocalScope(local_scope_);
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    interpreter::ProfilerGuard profiler_guard(place_, &cost_info);

    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    ExecuteInstructionList(vec_instruction_);
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  bool is_build = is_build_;
  global_scope_->SetLocalScope(local_scope_);
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // add listener before run when is_build=true
    global_scope_->ResetListener();

    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();
    ExecuteInstructionList(vec_instruction_);
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // clear the listener after run
  global_scope_->ClearListener();

  // return Fetch Tensors
  auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName);
  return std::move(*fetch_var->GetMutable<framework::FetchList>());
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names) {
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  if (!is_build_) {
    if (create_local_scope_ &&
        global_scope_->GetMutableLocalScope() !=
            global_scope_->GetMutableScope() &&
        global_scope_->GetMutableLocalScope()) {
      VLOG(4) << "Clear previous local scope before run";
      VLOG(4) << global_scope_->GetMutableScope() << " "
              << global_scope_->GetMutableLocalScope();
      platform::DeviceContextPool::Instance().Get(place_)->Wait();
      // TODO(zhiqiu): clear the tensor holder of all vars in previous local
      // scope?
    }
    global_scope_->SetLocalScope(local_scope_);
    paddle::framework::interpreter::build_variable_scope(
        block_, global_scope_, create_local_scope_);
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       global_scope_,
                                                       create_local_scope_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);

  } else {
    // add listener before run when is_build=true
    global_scope_->ResetListener();

    // For a program that runs only once, there is no need to create the
    // work_queue, so async_work_queue_ is not created until the second run.
    async_work_queue_ = GetWorkQueue();

    ExecuteInstructionList(vec_instruction_);
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // clear the listener after run
  global_scope_->ClearListener();

  // return Fetch Tensors
  auto* fetch_var = global_scope_->Var(interpreter::kFetchVarName);
  return std::move(*fetch_var->GetMutable<framework::FetchList>());
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << &src
          << ") to InterpreterCore(" << this << ")";
}

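// A variable may be reused inplace only when exactly one instruction actually
// needs its buffer as an input; this helper checks that condition.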
bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
  if (!global_scope_->VarDesc(var_index)) {
    return input_var2op_info_.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op_info_.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      if (info.IsInArgBufferNeeded(global_scope_->VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

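// Lazily construct the AsyncWorkQueue on first use; it can also be shared from
// another InterpreterCore via ShareWorkQueueFrom.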
std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
  }
  return async_work_queue_;
}

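// Resolve the input/output Variable pointers of an instruction once and cache
// them in its RuntimeContext/InferShapeContext so that they do not need to be
// rebuilt on every step.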
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(global_scope_->Var(id));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(global_scope_->Var(id));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP use scope in
                                                        // kernel
    Scope* local_scope = create_local_scope_
                             ? global_scope_->GetMutableLocalScope()
                             : global_scope_->GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

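// For every op that provides an inplace inference function, share the input
// buffer with the output when the input is not persistable, not marked as
// skip-inplace, and consumed by no other op.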
void InterpreterCore::BuildInplace() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = global_scope_->VarDesc(iter->second[0]);
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (global_scope_->GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            auto invar = global_scope_->Var(iter->second[0]);
            auto outvar = global_scope_->Var(iterout->second[0]);
            if (invar && outvar && invar->IsType<LoDTensor>() &&
                outvar->IsType<LoDTensor>()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << vec_instruction_[i].OpBase()->Type()
                      << " " << global_scope_->GetNameById(iter->second[0])
                      << " -> "
                      << global_scope_->GetNameById(iterout->second[0])
                      << std::endl;
            }
          }
        }
      }
    }
  }
}

void InterpreterCore::BuildOperatorDependences() {
  // analyse the dependencies between ops, set dependecy_count_, and call
  // Schedule for each op's downstream list
  auto op_nums = vec_instruction_.size();
  dependecy_count_.resize(op_nums);
  auto op2downstream = interpreter::build_op_downstream_map(
      vec_instruction_, &op_happens_before_);
  for (size_t op = 0; op < vec_instruction_.size(); ++op) {
    auto op_list = op2downstream[op];
    std::vector<size_t> downstream_vector(op_list.begin(), op_list.end());
    stream_analyzer_.Schedule(downstream_vector, &vec_instruction_, op);

    for (auto inst_id : op_list) {
      dependecy_count_[inst_id]++;
    }
  }
}

// At the end of each step, the holder of Tensor in LoDTensorArray is null.
// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception
// will occur in the next step
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  auto vars = local_scope_->LocalVars();
  for (auto var : vars) {
    if (var->IsType<LoDTensorArray>()) {
      auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
      lod_tensor_arr->clear();
    }
  }
}

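// Convert the OpFuncNode list into Instructions: build operator dependencies,
// record the last ops that use each variable (for GC), cache kernel contexts,
// and apply the inplace pass when enabled.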
void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = global_scope_->MutableVecMetaInfo();
  auto var_nums = global_scope_->VarSize();
  input_var2op_info_.resize(var_nums);
  auto nodes = *op_func_nodes;

  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
  }

  BuildOperatorDependences();

  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    std::set<size_t> gc_check_inputs;

    for (auto& item : instr.Inputs()) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        input_var2op_info_.at(id).push_back(op_idx);
        // var can be gc-ed
        if (!info.IsBuilt()) {
          info.Build(instr.OpBase());
        }
        auto* var_desc = global_scope_->VarDesc(id);
        if (var_desc) {
          if (info.IsInArgBufferNeeded(var_desc->Name())) {
            gc_check_inputs.insert(id);
          }
        } else {
          gc_check_inputs.insert(id);
        }
      }
    }

    for (auto var_id : gc_check_inputs) {
      paddle::framework::Variable* var = global_scope_->Var(var_id);
      if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << global_scope_->GetNameById(var_id)
                << " after " << instr.OpBase()->Type()
                << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    // check output
    for (auto& item : vec_instruction_[i].Outputs()) {
      for (auto var_id : item.second) {
        if (input_var2op_info_.at(var_id).size() == 0) {
          last_live_ops_[var_id].insert(i);
        }
      }
    }
  }

  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : skip_gc_vars_) {
    int var_id = global_scope_->GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink: keep only those last-live ops that do not happen before any other
  // op in the same list, since checking the later op is sufficient.
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minimum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any
      for (size_t other_item : last_live_ops_[i]) {
        if (op_happens_before_[item][other_item]) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << global_scope_->GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minimum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minimum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    gc_event_.emplace_back(vec_instruction_[i].DeviceContext().GetPlace(),
                           platform::GenerateDeviceEventFlag());
  }
  bool inplaced = false;
  for (auto inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  // prepare for the first time.
  std::promise<std::unique_ptr<AtomicVectorSizeT>> deps_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_deps_ = deps_promise.get_future();
  deps_promise.set_value(interpreter::PrepareAtomicDeps(dependecy_count_));

  std::promise<std::unique_ptr<AtomicVectorSizeT>> var_ref_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_var_ref_ = var_ref_promise.get_future();
  var_ref_promise.set_value(
      interpreter::PrepareAtomicVarRef(global_scope_->VecMetaInfo()));
}

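// If none of an instruction's LoDTensor inputs carries LoD information, mark
// its InferShapeContext so that sharing LoD can be skipped at run time.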
void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
      for (auto& var : input.second) {
        if (var->IsType<LoDTensor>()) {
          if (var->Get<LoDTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
  }
}

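// Run a single instruction: InferShape (unless the op computes runtime shape
// inside its kernel), optional inplace buffer sharing, then the phi kernel if
// it is valid, otherwise the fluid kernel or OperatorBase::Run.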
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  VLOG(4) << "Start run " << place << " " << op->DebugStringEx(global_scope_);
  Scope* local_scope = create_local_scope_
                           ? global_scope_->GetMutableLocalScope()
                           : global_scope_->GetMutableScope();
  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is an OperatorBase, InferShape does nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
    }
  }

  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace ?
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext pt_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &pt_kernel_context);

        (*instr_node.PhiKernel())(&pt_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          global_scope_->Var(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          global_scope_->Var(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back from "
              << global_scope_->GetNameById(p.first) << " to "
              << global_scope_->GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debug nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *global_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

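// Submit every instruction whose dependency count is zero to the work queue,
// then block until all ops have finished or an exception is caught.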
void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  unfinished_op_numer_ = vec_instr.size();
  if (unfinished_op_numer_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  platform::RecordEvent record_prepare(
      "PrepareAtomic", platform::TracerEventType::UserDefined, 1);
  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
  // those for the next step
  auto atomic_deps = atomic_deps_.get();
  auto atomic_var_ref = atomic_var_ref_.get();

  atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_);
  atomic_var_ref_ =
      async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
  record_prepare.End();

  exception_holder_.Clear();

  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this,
                                  i,
                                  atomic_deps = atomic_deps.get(),
                                  atomic_var_ref = atomic_var_ref.get()] {
                                   RunInstructionAsync(
                                       i, atomic_deps, atomic_var_ref);
                                 });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}

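// Decrement the dependency counters of downstream instructions; ready ones
// either stay on the current thread (via reserved_next_ops) or are submitted
// to the async work queue, depending on the kernel types involved.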
void InterpreterCore::RunNextInstructions(
    const Instruction& instr,
    std::queue<size_t>* reserved_next_ops,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);
  VLOG(4) << "atomic 1:" << atomic_deps;
  auto& next_instr = instr.NextInstructions();

  auto IsReady = [atomic_deps](size_t next_id) {
    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
            << ", remain deps: " << (*atomic_deps)[next_id];
    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  if (instr.KernelType() == OpFuncType::kQueueAsync) {
    // move all sync_ops into other threads
    for (auto next_id : next_instr.SyncRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref]() {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    // keep all async_ops running in current thread
    for (auto next_id : next_instr.DirectRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
  } else {
    // move async_ops into async_thread
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
                                                    next_instr.DirectRunIds());
    size_t first_op = 0;
    for (auto next_id : direct_run_ops) {
      if (IsReady(next_id)) {
        // only keep one op running in current thread
        if (first_op == 0) {
          first_op = next_id;
          continue;
        }
        // move rest ops into other threads
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    if (first_op != 0) reserved_next_ops->push(first_op);
  }
}

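// Worker-side loop: run the given instruction, then keep draining the ops that
// RunNextInstructions schedules back onto this thread.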
void InterpreterCore::RunInstructionAsync(
    size_t instr_id,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  std::queue<size_t> ready_ops;
  ready_ops.push(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop();
    auto& instr_node = vec_instruction_.at(instr_id);
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kQueueSync
                    ? "kQueueSync"
                    : "kQueueAsync")
            << " runs on " << platform::GetCurrentThreadName();

    auto* op = instr_node.OpBase();
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);

    try {
      interpreter::WaitEvent(instr_node, place_);

      RunInstruction(instr_node);

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      RecordStreamForGC(instr_node);
#endif
      CheckGC(instr_node, atomic_var_ref);

      interpreter::RecordEvent(instr_node, place_);
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }

    VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_;
    if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) ==
                 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
  }
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kQueueAsync) {
    return;
  }
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);

  gpuStream_t stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                           instr.DeviceContext())
                           .stream();
  auto TensorRecordStream = [&stream](Tensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Here we should do something to make sure that the
      // tensor is not freed until the H2D copies are done. However, simply
      // launching a CUDA runtime callback to the H2D stream may lead to high
      // performance overhead. As all the cases we meet in H2D are copies from
      // CPUPlace at present, we just log a WARNING here. A better design is
      // required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead to data inconsistency";
    } else {
      // memory copies involving CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction run in.
   * 3. The tensor is the instruction's input, cause we assume that instruction
   * will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function returns directly when the recorded
   * stream is equal to the owning stream, recording the same stream as the one
   * that initialized this tensor has little overhead. Conversely, it may take
   * more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. The instruction has no idea of which vars involve async running in the
   * OP function, and thus we cannot recognize condition 4. It should be
   * supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC sync " << global_scope_->GetNameById(var_id) << " "
            << global_scope_->VarDesc(var_id);

    // persistable vars will be ignored during GC
    if (global_scope_->VarDesc(var_id) &&
        global_scope_->VarDesc(var_id)->Persistable()) {
      continue;
    }

    paddle::framework::Variable* var = global_scope_->Var(var_id);
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<LoDTensor>()) {
      TensorRecordStream(*(var->GetMutable<LoDTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
    } else if (var->IsType<phi::SelectedRows>()) {
      TensorRecordStream(
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
}
#endif

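// Decrease the reference count of each variable checked by this instruction
// and hand non-persistable variables whose count reaches zero to the garbage
// collector.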
void InterpreterCore::CheckGC(
    const Instruction& instr,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
  size_t instr_id = instr.Id();
  auto& var_scope = *global_scope_;

  for (auto var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
            << var_scope.VarDesc(var_id);
    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
            << " " << var_id;
    bool is_ready =
        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
    // ignore all persistable vars while GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
      if (IsInterpretercoreFastGCEnabled()) {
        static_cast<InterpreterCoreFastGarbageCollector*>(gc_.get())->Add(
            var_scope.Var(var_id));

      } else {
        static_cast<InterpreterCoreEventGarbageCollector*>(gc_.get())->Add(
            var_scope.Var(var_id),
            &gc_event_.at(instr_id),
            &instr.DeviceContext());
      }
#else
      static_cast<InterpreterCoreEventGarbageCollector*>(gc_.get())->Add(
          var_scope.Var(var_id),
          &gc_event_.at(instr_id),
          &instr.DeviceContext());
#endif
    }
  }
}

void InterpreterCore::Prepare(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors,
    bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
                        feed_names.size(),
                        feed_tensors.size()));

  auto FeedInput = [&] {
    VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = global_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));

      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
      feed_tensor->ShareDataWith(feed_tensors[i]);
      feed_tensor->set_lod(feed_tensors[i].lod());
    }
  };

  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, global_scope_, create_local_scope_);
    FeedInput();
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       global_scope_,
                                                       create_local_scope_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  }
  // NOTE: Because feed_tensor will be GC-ed after
  // paddle::framework::build_op_func_list, we should call FeedInput again.
  if (prepare_feed) {
    FeedInput();
  }
}

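// Feed variables must keep their buffers across the run, so mark them to be
// skipped by the inplace pass.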
void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
    global_scope_->SetVarSikpInplace(feed_name, true);
  }
}

std::shared_ptr<InterpreterCore> CreateInterpreterCore(
    const platform::Place& place,
    const ProgramDesc& prog,
    VariableScope* global_scope,
    const std::vector<std::string>& fetch_names,
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::add_fetch(fetch_names, block);

  core = std::make_shared<InterpreterCore>(
      place, *block, skip_gc_vars, global_scope);
  core->SetCopyProgram(new_prog);
  return core;
}

}  // namespace framework
}  // namespace paddle