// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/new_executor/interpretercore.h"

#include <unordered_set>

#include "paddle/fluid/framework/details/nan_inf_utils.h"
#include "paddle/fluid/framework/details/share_tensor_buffer_functor.h"
#include "paddle/fluid/framework/new_executor/interpretercore_util.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#include "paddle/fluid/platform/device/gpu/gpu_info.h"

PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace,
                            false,
                            "Use inplace in new executor");
PADDLE_DEFINE_EXPORTED_bool(new_executor_use_local_scope,
                            true,
                            "Use local_scope in new executor(especially used "
                            "in UT), can turn off for better performance");

DECLARE_bool(check_nan_inf);
DECLARE_bool(benchmark);

constexpr const char* kExceptionCaught = "ExceptionCaught";
constexpr const char* kTaskCompletion = "TaskCompletion";

namespace paddle {
namespace framework {
// NOTE(Aurelius84): Need a better strategy to determine it.
static constexpr size_t kHostNumThreads = 4;
static constexpr size_t kDeviceNumThreads = 1;

InterpreterCore::InterpreterCore(const platform::Place& place,
                                 const BlockDesc& block,
                                 const std::set<std::string>& skip_gc_vars,
                                 framework::Scope* scope,
                                 bool used_for_jit)
    : place_(place),
      block_(block),
      skip_gc_vars_(skip_gc_vars),
      var_scope_(scope),
      stream_analyzer_(place),
      used_for_jit_(used_for_jit) {
  VLOG(4) << "InterpreterCore(): " << this << " on " << place_;

  is_build_ = false;

  exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught);
  completion_notifier_ = main_thread_blocker_.RegisterEvent(kTaskCompletion);

  create_local_scope_ = FLAGS_new_executor_use_local_scope;

  if (used_for_jit_) {
    create_local_scope_ = false;
  }
  VLOG(4) << "create_local_scope_ is " << create_local_scope_;

  if (create_local_scope_) {
    auto local_scope = &var_scope_.GetMutableScope()->NewScope();
    local_scope_ = local_scope;
  }
  var_scope_.SetLocalScope(local_scope_);

  // prune

  // optimize graph pass

  // convert to run graph
}

InterpreterCore::~InterpreterCore() {
  // cancel gc's thread
  gc_.reset(nullptr);
  async_work_queue_.reset();
  VLOG(4) << "~InterpreterCore(): " << this << " on " << place_;

#ifdef PADDLE_WITH_MKLDNN
  // Clear mkl-dnn cache,
  // this is needed to have mkl-dnn unit tests working
  platform::ClearMKLDNNCache(place_, this);
#endif
}

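// DryRun builds the program on the first call, executes every instruction
// once, and returns the cost statistics collected for this run.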
interpreter::CostInfo InterpreterCore::DryRun(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
  Prepare(feed_names, feed_tensors, true);
  interpreter::CostInfo cost_info;
  {
    // Keep the profiler guard alive for the whole scope; an unnamed temporary
    // would be destroyed immediately and measure nothing.
    interpreter::ProfilerGuard profiler_guard(place_, &cost_info);

    // For a program that only runs once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the second
    // step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  return cost_info;
}

paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  bool is_build = is_build_;
  Prepare(feed_names, feed_tensors, is_build);

  if (is_build) {
    // For a program that only runs once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the second
    // step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }
  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }

  // return Fetch Tensors
  auto* fetch_var = local_scope_->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

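// Run without explicit feed tensors: build the variable scope and op list on
// the first call, then execute the cached instruction list and return the
// fetch results.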
paddle::framework::FetchList InterpreterCore::Run(
    const std::vector<std::string>& feed_names) {
#if defined(PADDLE_WITH_CUDA) && defined(PADDLE_WITH_HETERPS)
  if (platform::is_gpu_place(place_)) {
    platform::SetDeviceId(place_.device);
  }
#endif
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
#endif
  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, &var_scope_, create_local_scope_);

    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_,
                                                       used_for_jit_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  } else {
    // For a program that only runs once, there is no need to create the
    // work_queue, so the async_work_queue_ is not created until the second
    // step runs.
    async_work_queue_ = GetWorkQueue();

    // lazy initialization of gc, do not create gc if the program only runs once
    if (!gc_) {
      gc_ = CreateInterpreterCoreGarbageCollector(place_, vec_instruction_);
    }

    ExecuteInstructionList(vec_instruction_);
#ifdef PADDLE_WITH_ASCEND_CL
    platform::DeviceContextPool::Instance().Get(place_)->Wait();
#endif
  }

  if (create_local_scope_) {
    ClearLoDTensorArrayInLocalScope();
  }
  // return Fetch Tensors
  Scope* inner_scope =
      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
  auto* fetch_var = inner_scope->FindVar(interpreter::kFetchVarName);
  if (fetch_var) {
    return std::move(*fetch_var->GetMutable<framework::FetchList>());
  } else {
    return {};
  }
}

void InterpreterCore::SetCopyProgram(std::shared_ptr<ProgramDesc> prog) {
  copy_program_ = prog;
}

void InterpreterCore::SetSkipGcVars(const std::set<std::string>& skip_gc_vars) {
  PADDLE_ENFORCE_EQ(
      skip_gc_vars_.empty(),
      true,
      platform::errors::PreconditionNotMet(
          "Skip_gc_vars_ can only be initialized once, now skip_gc_vars_ is "
          "not empty, do not call SetSkipGcVars method repeatedly."));
  skip_gc_vars_ = skip_gc_vars;
}

const VariableScope* InterpreterCore::GetVariableScope() const {
  return &var_scope_;
}

void InterpreterCore::reset_scope(Scope* new_scope) {
  var_scope_.SetScope(new_scope);
  auto& var_list = var_scope_.MutableVarList();
  for (size_t i = 0; i < var_list.size(); i++) {
    var_list[i] = new_scope->FindVar(var_scope_.GetNameById(i));
  }
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }
}

void InterpreterCore::ShareWorkQueueFrom(std::shared_ptr<InterpreterCore> src) {
  async_work_queue_ = src->GetWorkQueue();
  VLOG(8) << "Share AsyncWorkQueue from InterpreterCore(" << &src
          << ") to InterpreterCore(" << this << ")";
}

bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
  if (!var_scope_.VarDesc(var_index)) {
    return input_var2op_info_.at(var_index).size() == 1;
  } else {
    int is_input_cnt = 0;
    for (auto inst_id : input_var2op_info_.at(var_index)) {
      OpInOutInfo info;
      info.Build(vec_instruction_.at(inst_id).OpBase());
      if (info.IsInArgBufferNeeded(var_scope_.VarDesc(var_index)->Name())) {
        is_input_cnt++;
      }
    }
    return is_input_cnt == 1;
  }
}

std::shared_ptr<interpreter::AsyncWorkQueue> InterpreterCore::GetWorkQueue() {
  if (async_work_queue_ == nullptr) {
    async_work_queue_ = std::make_shared<interpreter::AsyncWorkQueue>(
        kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_);
  }
  return async_work_queue_;
}

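// Gather the input/output Variable pointers of an instruction from the
// (local) scope and cache them in its runtime and infershape contexts.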
void InterpreterCore::BuildAndCacheInstructionCtx(Instruction* instr_node) {
  Scope* inner_scope =
      create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
  VariableValueMap ins_map;
  for (auto& var_name_item : instr_node->Inputs()) {
    std::vector<Variable*> input_vars;

    input_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      input_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    ins_map.emplace(var_name_item.first, std::move(input_vars));
  }

  VariableValueMap outs_map;
  for (auto& var_name_item : instr_node->Outputs()) {
    std::vector<Variable*> out_vars;

    out_vars.reserve(var_name_item.second.size());
    for (auto& id : var_name_item.second) {
      out_vars.emplace_back(inner_scope->FindVar(var_scope_.GetNameById(id)));
    }
    outs_map.emplace(var_name_item.first, std::move(out_vars));
  }

  // set runtime_ctx and infershape_ctx_
  if (instr_node->OpBase()->Type() == "cinn_launch") {  // OP uses scope in
                                                        // kernel
    Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                             : var_scope_.GetMutableScope();
    instr_node->ResetContextWithScope(ins_map, outs_map, *local_scope);
  } else {
    instr_node->ResetContext(ins_map, outs_map);
  }
}

void InterpreterCore::BuildInplace() {
  // NOTE(Ruibiao): coalesce_tensor_op outputs a FusedOutput Tensor and a list
  // of Output Tensors which are sliced from the FusedOutput. These outputs
  // should not be the outvar of the in-place var-pair since memory reuse
  // between FusedOutput and Output Tensors is assumed. For the following
  // example:
  // fused_var, var1, var2, var3 = coalesce_tensor(var1, var2, var3)
  // var1 = sum(var4, var5)
  // ...
  //
  // After running coalesce_tensor_op, var1 is assumed to share the buffer
  // slices from fused_var. However, if sum_op is in-place, then var1 would
  // re-share the buffer with var4 instead of fused_var.
  std::set<std::string> skip_inplace_outvars;
  for (Instruction& instr : vec_instruction_) {
    OperatorBase* op = instr.OpBase();
    if (op->Type() == "coalesce_tensor") {
      const std::vector<std::string>& outputs =
          op->OutputVars(/*has_intermediate=*/false);
      skip_inplace_outvars.insert(outputs.begin(), outputs.end());
    }
  }

  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    auto& instr = vec_instruction_[i];
    auto* op_base = instr.OpBase();
    if (!op_base->Info().infer_inplace_) {
      continue;
    }

    auto in_to_outs = op_base->Info().infer_inplace_(
        platform::is_gpu_place(instr.DeviceContext().GetPlace()));

    auto& inputs = instr.Inputs();
    auto& outputs = instr.Outputs();
    for (auto& pair : in_to_outs) {
      auto iter = inputs.find(pair.first);
      if (iter != inputs.end() && !iter->second.empty()) {
        auto in_var_desc = var_scope_.VarDesc(iter->second[0]);
        if (in_var_desc && in_var_desc->Persistable()) {
          continue;
        }
        if (var_scope_.GetVarSikpInplace(iter->second[0])) {
          continue;
        }
        if (BuildInplaceCheckVarIsOnlyInput(iter->second[0])) {
          auto iterout = outputs.find(pair.second);
          if (iterout != outputs.end() && !iterout->second.empty()) {
            const std::string& invar_name =
                var_scope_.GetNameById(iter->second[0]);
            const std::string& outvar_name =
                var_scope_.GetNameById(iterout->second[0]);
            auto invar = local_scope->FindVar(invar_name);
            auto outvar = local_scope->FindVar(outvar_name);

            if (invar && outvar && invar->IsType<LoDTensor>() &&
                outvar->IsType<LoDTensor>() &&
                skip_inplace_outvars.find(outvar_name) ==
                    skip_inplace_outvars.end()) {
              instr.AddInplace(invar, outvar);
              VLOG(3) << "inplace " << op_base->Type() << " " << invar_name
                      << " -> " << outvar_name;
            }
          }
        }
      }
    }
  }
}

void InterpreterCore::BuildOperatorDependences() {
  // analyze the dependencies between ops, set the dependecy_count_, and call
  // Schedule
  auto op_nums = vec_instruction_.size();
  dependecy_count_.resize(op_nums);
  auto op2downstream = dependency_builder_.Build(vec_instruction_);
  for (size_t op = 0; op < vec_instruction_.size(); ++op) {
    auto op_list = op2downstream[op];
    std::vector<size_t> downsteam_vector(op_list.begin(), op_list.end());
    stream_analyzer_.Schedule(downsteam_vector, &vec_instruction_, op);

    for (auto inst_id : op_list) {
      dependecy_count_[inst_id]++;
    }
  }
}

// At the end of each step, the holder of Tensor in LoDTensorArray is null.
// Clear these Tensors and leave LoDTensorArray empty, otherwise an exception
// will occur in the next step
void InterpreterCore::ClearLoDTensorArrayInLocalScope() {
  auto vars = local_scope_->LocalVars();
  for (auto var : vars) {
    if (var->IsType<LoDTensorArray>()) {
      auto* lod_tensor_arr = var->GetMutable<LoDTensorArray>();
      lod_tensor_arr->clear();
    }
  }
}

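// Convert the op_func_node list into instructions: build operator
// dependencies, calculate the last-live-op info used for GC, cache the
// instruction contexts, and optionally apply the inplace optimization.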
void InterpreterCore::Convert(
    std::vector<paddle::framework::OpFuncNode>* op_func_nodes) {
  auto& vec_meta_info = var_scope_.MutableVecMetaInfo();
  auto var_nums = var_scope_.VarSize();
  input_var2op_info_.resize(var_nums);
  auto nodes = *op_func_nodes;

  auto op_nums = nodes.size();
  vec_instruction_.reserve(op_nums);
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& op_func_node = nodes[op_idx];
    auto* dev_ctx_ = stream_analyzer_.ParseDeviceContext(op_func_node);
    vec_instruction_.emplace_back(op_idx, std::move(op_func_node), *dev_ctx_);
  }
  BuildOperatorDependences();
  // calculate last_live_ops_
  for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) {
    auto& instr = vec_instruction_[op_idx];
    OpInOutInfo info;
    std::set<size_t> gc_check_inputs;
    for (auto& item : instr.Inputs()) {
      for (auto id : item.second) {
        if (id == kEmptyVarIndex) {
          continue;
        }
        input_var2op_info_.at(id).push_back(op_idx);
        // var can be gc-ed
        if (!info.IsBuilt()) {
          info.Build(instr.OpBase());
        }
        auto* var_desc = var_scope_.VarDesc(id);
        if (var_desc) {
          if (info.IsInArgBufferNeeded(var_desc->Name())) {
            gc_check_inputs.insert(id);
          }
        } else {
          gc_check_inputs.insert(id);
        }
      }
    }
    for (auto var_id : gc_check_inputs) {
      Scope* inner_scope =
          create_local_scope_ ? local_scope_ : var_scope_.GetMutableScope();
      paddle::framework::Variable* var =
          inner_scope->FindVar(var_scope_.GetNameById(var_id));
      if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>() ||
          var->IsType<LoDTensorArray>()) {
        last_live_ops_[var_id].insert(op_idx);
      } else {
        VLOG(4) << "not clear " << var_scope_.GetNameById(var_id) << " after "
                << instr.OpBase()->Type() << " because its type is "
                << framework::ToTypeName(var->Type());
      }
    }
  }
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    // check output
    for (auto& item : vec_instruction_[i].Outputs()) {
      for (auto var_id : item.second) {
        if (input_var2op_info_.at(var_id).size() == 0) {
          last_live_ops_[var_id].insert(i);
        }
      }
    }
  }
  // clear the last_live_ops list for all vars in skip_gc_vars
  for (const std::string& skip_gc_var : skip_gc_vars_) {
    int var_id = var_scope_.GetIdByName(skip_gc_var);
    if (var_id != -1) {
      last_live_ops_[var_id].clear();
      VLOG(8) << "Skip gc for var: " << skip_gc_var;
    }
  }

  // shrink: find the downstream ops that do not happen before any other op in
  // the downstream list
  // For example,
  // b = op1(a)
  // c = op2(a, b)
  // in this case, a is the input of op1 and op2, we only need to check
  // a after op2, because op2 always uses a after op1.
  for (size_t i = 0; i < last_live_ops_.size(); ++i) {
    std::set<size_t> minumum_last_live_ops;
    for (size_t item : last_live_ops_[i]) {
      bool not_before_any = true;
      // find the op that is not executed before any other
      for (size_t other_item : last_live_ops_[i]) {
        if (dependency_builder_.OpHappensBefore(item, other_item)) {
          VLOG(8) << "happens_before: " << item << "->" << other_item
                  << ", so skip " << item;
          not_before_any = false;
          break;
        }
      }
      if (not_before_any) {
        VLOG(8) << "last live op of var " << i << " "
                << var_scope_.GetNameById(i) << " : " << item << " "
                << vec_instruction_[item].OpBase()->Type();
        minumum_last_live_ops.insert(item);
        vec_instruction_[item].AddGCCheckVar(i);
      }
    }
    last_live_ops_[i] = minumum_last_live_ops;
    vec_meta_info[i].var_ref_count_ = last_live_ops_[i].size();
  }

  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    BuildAndCacheInstructionCtx(&vec_instruction_[i]);
  }

  BuildSkipShareLoDInfo();

  bool inplaced = false;
  for (auto inst : vec_instruction_) {
    if (inst.OpBase()->Type() == "share_buffer" ||
        inst.OpBase()->Type() == "share_data") {
      VLOG(4) << "Already inplaced, skip inplace now.";
      inplaced = true;
    }
  }

  if (FLAGS_new_executor_use_inplace && !inplaced) {
    BuildInplace();
  }

  // prepare for the first time.
  std::promise<std::unique_ptr<AtomicVectorSizeT>> deps_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_deps_ = deps_promise.get_future();
  deps_promise.set_value(interpreter::PrepareAtomicDeps(dependecy_count_));

  std::promise<std::unique_ptr<AtomicVectorSizeT>> var_ref_promise =
      std::promise<std::unique_ptr<AtomicVectorSizeT>>();
  atomic_var_ref_ = var_ref_promise.get_future();
  var_ref_promise.set_value(
      interpreter::PrepareAtomicVarRef(var_scope_.VecMetaInfo()));
}

void InterpreterCore::BuildSkipShareLoDInfo() {
  for (size_t i = 0; i < vec_instruction_.size(); ++i) {
    bool can_skip_lod = true;
    for (auto& input : vec_instruction_[i].InnerRuntimeContext()->inputs) {
      for (auto& var : input.second) {
        if (var->IsType<LoDTensor>()) {
          if (var->Get<LoDTensor>().lod().size() != 0) {
            can_skip_lod = false;
            break;
          }
        } else {
          can_skip_lod = false;
          break;
        }
      }
    }
    vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod);
  }
}

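// Run a single instruction: InferShape (unless skipped), optional inplace
// buffer sharing, kernel computation (phi kernel, kernel func, or
// OperatorBase::Run), inplace-back transfer, and optional nan/inf checks.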
void InterpreterCore::RunInstruction(const Instruction& instr_node) {
  auto* op = instr_node.OpBase();
  auto place = instr_node.DeviceContext().GetPlace();
  Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope()
                                           : var_scope_.GetMutableScope();

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place)) {
    auto dev_id = place.device;
    platform::SetNPUDeviceId(dev_id);
    // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable
    // values, but only through special `float_status` to check whether
    // the operation is overflow. More about `float_status`, see:
    // https://gitee.com/ascend/modelzoo/issues/I3NF8V?from=project-issue
    if (FLAGS_check_nan_inf) {
      framework::details::NPUAllocAndClearFloatStatus(*op, *local_scope, place);
    }
  }
#endif

  auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
  {
    // If it is OperatorBase, InferShape does nothing.
    if (op_with_kernel != nullptr) {
      platform::RecordEvent infershape_event(
          "infer_shape",
          platform::TracerEventType::OperatorInner,
          1,
          platform::EventRole::kInnerOp);

      // see OperatorWithKernel::RunImpl in operator.cc for why
      if (!(op_with_kernel->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
            op_with_kernel->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
        op_with_kernel->Info().infer_shape_(
            instr_node.InnerInferShapeContext().get());
      }
      infershape_event.End();
      platform::RecordOpInfoSupplement(op->Type(),
                                       op->Attrs(),
                                       *(instr_node.InnerInferShapeContext()),
                                       *(instr_node.InnerRuntimeContext()));
    }
  }
  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
    // TODO(xiongkun03) Does operator base support inplace ?
    for (auto& pair : instr_node.InplaceInfo()) {
      const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
      auto* out =
          paddle::framework::details::GetMutableTensorFromVar(pair.second);
      if (in.dims() == out->dims()) {
        out->ShareBufferWith(in);
      }
    }
  }

  {
    platform::RecordEvent compute_event(
        "compute",
        platform::TracerEventType::OperatorInner,
        1,
        platform::EventRole::kInnerOp);
    if (op_with_kernel == nullptr) {
      instr_node.OpBase()->Run(*local_scope, place_);
    } else {
      // fit for phi
      if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) {
        VLOG(4) << "Run phi kernel: " << op->Type();
        VLOG(4) << instr_node.InnerRuntimeContext().get() << " "
                << &instr_node.DeviceContext();
        phi::KernelContext phi_kernel_context;
        op_with_kernel->BuildPhiKernelContext(
            *instr_node.InnerRuntimeContext().get(),
            const_cast<platform::DeviceContext*>(&instr_node.DeviceContext()),
            &phi_kernel_context);

        (*instr_node.PhiKernel())(&phi_kernel_context);

      } else {
        instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get());
      }
    }
  }

  VLOG(4) << "End run " << place << " " << op->DebugStringEx(local_scope_);

  if (!instr_node.InplaceBackMap().empty()) {
    platform::RecordEvent inplaceback_event(
        "InplaceVarsBack", platform::TracerEventType::UserDefined, 10);
    auto& m = instr_node.InplaceBackMap();
    // NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
    for (auto& p : m) {
      auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.first));
      auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
          var_scope_.VarRef(p.second));
      original_tensor->ShareDataWith(*transformed_tensor);
      VLOG(4) << "Transfer inplace variable back from "
              << var_scope_.GetNameById(p.first) << " to "
              << var_scope_.GetNameById(p.second);
    }
  }

  /*For profiling/benchmark only*/
  if (FLAGS_benchmark) {
    instr_node.DeviceContext().Wait();
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    PADDLE_ENFORCE_GPU_SUCCESS(platform::GpuGetLastError());
    VLOG(4) << "Operator(" << op->Type()
            << "): context wait and get last error";
#endif
  }

  // for debug nan/inf
  if (op_with_kernel != nullptr && FLAGS_check_nan_inf) {
    VLOG(4) << "Check nan/inf";
    framework::details::CheckOpHasNanOrInf(
        *op,
        *local_scope_,
        place);  // TODO(xiongkun03) change it to inner scope.
  }
}

void InterpreterCore::ExecuteInstructionList(
    const std::vector<Instruction>& vec_instr) {
  unfinished_op_numer_ = vec_instr.size();
  if (unfinished_op_numer_ == 0) {
    VLOG(4) << "No op to run, return";
    return;
  }

  platform::RecordEvent record_prepare(
      "PrepareAtomic", platform::TracerEventType::UserDefined, 1);
  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
  // those for the next step
  auto atomic_deps = atomic_deps_.get();
  auto atomic_var_ref = atomic_var_ref_.get();

  atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_);
  atomic_var_ref_ =
      async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo());
  record_prepare.End();

  exception_holder_.Clear();

  for (size_t i = 0; i < dependecy_count_.size(); ++i) {
    if (dependecy_count_[i] == 0) {
      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
                                 [this,
                                  i,
                                  atomic_deps = atomic_deps.get(),
                                  atomic_var_ref = atomic_var_ref.get()] {
                                   RunInstructionAsync(
                                       i, atomic_deps, atomic_var_ref);
                                 });
    }
  }

  auto event_name = main_thread_blocker_.WaitEvent();
  VLOG(1) << "main_thread_blocker_(" << &main_thread_blocker_
          << ") got event_name: " << event_name;

  if (UNLIKELY(exception_holder_.IsCaught())) {
    VLOG(1) << "Exception caught " << exception_holder_.Type();
    // Graceful exit when the executor encountered a fatal error.
    // EOF is not a fatal error.
    if (exception_holder_.Type() != "EOF") {
      async_work_queue_->Cancel();
    }
    VLOG(4) << "Cancel ok";
    PADDLE_ENFORCE_EQ(
        main_thread_blocker_.Clear(),
        0,
        platform::errors::PreconditionNotMet(
            "main_thread_blocker_.Clear() return -1, clear failed"));
    VLOG(4) << "clear ok";
    exception_holder_.ReThrow();
  }
}

void InterpreterCore::RunNextInstructions(
    const Instruction& instr,
    std::queue<size_t>* reserved_next_ops,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "RunNextInstructions", platform::TracerEventType::UserDefined, 10);
  VLOG(4) << "atomic 1:" << atomic_deps;
  auto& next_instr = instr.NextInstructions();

  auto IsReady = [atomic_deps](size_t next_id) {
    VLOG(4) << "atomic:" << atomic_deps << " op_id: " << next_id
            << ", remain deps: " << (*atomic_deps)[next_id];
    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  if (instr.KernelType() == OpFuncType::kQueueAsync) {
    // move all sync_ops into other threads
    for (auto next_id : next_instr.SyncRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref]() {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    // keep all async_ops running in current thread
    for (auto next_id : next_instr.DirectRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        reserved_next_ops->push(next_id);
      }
    }
  } else {
    // move async_ops into async_thread
    for (auto next_id : next_instr.EventRunIds()) {
      if (IsReady(next_id)) {
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
                                                    next_instr.DirectRunIds());
    size_t first_op = 0;
    for (auto next_id : direct_run_ops) {
      if (IsReady(next_id)) {
        // only keep one op running in current thread
        if (first_op == 0) {
          first_op = next_id;
          continue;
        }
        // move rest ops into other threads
        async_work_queue_->AddTask(
            vec_instruction_[next_id].KernelType(),
            [this, next_id, atomic_deps, atomic_var_ref] {
              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
            });
      }
    }
    if (first_op != 0) reserved_next_ops->push(first_op);
  }
}

void InterpreterCore::RunInstructionAsync(
    size_t instr_id,
    std::vector<std::atomic<size_t>>* atomic_deps,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  std::queue<size_t> ready_ops;
  ready_ops.push(instr_id);
  while (!ready_ops.empty()) {
    instr_id = ready_ops.front();
    ready_ops.pop();
    auto& instr_node = vec_instruction_.at(instr_id);
    VLOG(5) << __func__ << " OP id:" << instr_node.Id()
            << " name:" << instr_node.OpBase()->Type() << " type:"
            << (instr_node.KernelType() == OpFuncType::kQueueSync
                    ? "kQueueSync"
                    : "kQueueAsync")
            << " runs on " << platform::GetCurrentThreadName();

    auto* op = instr_node.OpBase();
    platform::RecordEvent instruction_event(
        op->Type(), platform::TracerEventType::Operator, 1);

    try {
      interpreter::WaitEvent(instr_node, place_);

      RunInstruction(instr_node);

      CheckGC(instr_node, atomic_var_ref);

      interpreter::RecordEvent(instr_node, place_);
    } catch (platform::EnforceNotMet& ex) {
      framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
      exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
    } catch (platform::EOFException&) {
      exception_holder_.Catch(std::current_exception());
    } catch (std::exception& ex) {
      LOG(WARNING) << op->Type() << " raises an exception "
                   << platform::demangle(typeid(ex).name()) << ", "
                   << ex.what();
      exception_holder_.Catch(std::current_exception());
    } catch (...) {
      LOG(WARNING) << op->Type() << " raises an unknown exception";
      exception_holder_.Catch(std::current_exception());
    }

    if (UNLIKELY(exception_holder_.IsCaught())) {
      VLOG(4) << "Exception caught";
      if (exception_notifier_ != nullptr) {
        exception_notifier_->NotifyEvent();
      }
      return;
    }

    VLOG(4) << "unfinished_op_numer_: " << unfinished_op_numer_;
    if (UNLIKELY(unfinished_op_numer_.fetch_sub(1, std::memory_order_relaxed) ==
                 1)) {
      if (completion_notifier_ != nullptr) {
        completion_notifier_->NotifyEvent();
      }
    }

    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
  }
}

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
  if (!IsInterpretercoreFastGCEnabled() ||
      instr.KernelType() != OpFuncType::kQueueAsync) {
    return;
  }
  platform::RecordEvent record(
      "RecordStreamForGC", platform::TracerEventType::UserDefined, 10);

  gpuStream_t stream =
      reinterpret_cast<const phi::GPUContext&>(instr.DeviceContext()).stream();
  auto TensorRecordStream = [&stream](Tensor& tensor) {
    auto allocation = tensor.Holder();
    if (allocation == nullptr) {
      return;
    }

    const platform::Place& place = allocation->place();
    if (platform::is_gpu_place(place)) {
      memory::RecordStream(allocation, stream);
    } else if (platform::is_cuda_pinned_place(place)) {
      // TODO(Ruibiao): Something should be done here to make sure the tensor
      // is not freed until the H2D copies are done. However, simply launching
      // a CUDA runtime callback to the H2D stream may lead to a high performance
      // overhead. As all the cases we meet in H2D are copies from CPUPlace at
      // present, we just log a WARNING here. A better design is required.
      LOG(WARNING) << "Copy data from a CUDAPinned tensor in an asynchronous "
                      "manner may lead a data inconsistent";
    } else {
      // memory copies involving CPUPlace are always synchronous, so just do
      // nothing here
    }
  };

  /* NOTE(Ruibiao):Cross-stream tensor synchronization is required only when
   * all the following conditions are satisfied:
   * 1. The tensor will be GC-ed after running the instruction, i.e., in
   * instr.GCCheckVars.
   * 2. The stream which initializes this tensor is different from the stream
   * which the instruction runs in.
   * 3. The tensor is the instruction's input, because we assume that the instruction
   * will initialize all output tensors with its running stream.
   * 4. In the OP function of this instruction, the tensor is an input of a
   * async CUDA kernel.
   *
   * Here we only process the first condition, because:
   * 1. Since the RecordStream function will directly return when the recorded
   * stream is equal to the owning stream, recording the same stream as the one that
   * initialized this tensor has less time overhead. Conversely, it may take
   * more time if we try to extract those cross-stream input vars from
   * instr.GCCheckVars.
   * 2. Now the instruction has no idea of which vars involve async running in the
   * OP function, and thus we can not recognize condition 4. It should be
   * supported later.
   */
  for (int var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC sync " << var_scope_.GetNameById(var_id) << " "
            << var_scope_.VarDesc(var_id);

    // persistable var will be ignored while GC
    if (var_scope_.VarDesc(var_id) &&
        var_scope_.VarDesc(var_id)->Persistable()) {
      continue;
    }

    paddle::framework::Variable* var = var_scope_.VarRef(var_id);
    if (var == nullptr) {
      continue;
    }

    if (var->IsType<LoDTensor>()) {
      TensorRecordStream(*(var->GetMutable<LoDTensor>()));
    } else if (var->IsType<
                   operators::reader::
                       OrderedMultiDeviceLoDTensorBlockingQueueHolder>()) {
      // do nothing
    } else if (var->IsType<phi::SelectedRows>()) {
      TensorRecordStream(
          *(var->GetMutable<phi::SelectedRows>()->mutable_value()));
    } else if (var->IsType<LoDTensorArray>()) {
      auto* tensor_arr = var->GetMutable<LoDTensorArray>();
      for (auto& tensor : *tensor_arr) {
        TensorRecordStream(tensor);
      }
    } else if (var->IsType<std::vector<Scope*>>()) {
      // do nothing
    } else {
      PADDLE_THROW(platform::errors::Unimplemented(
          "The variable(%s) is not supported in eager deletion.",
          framework::ToTypeName(var->Type())));
    }
  }
}
#endif

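// Decrease the atomic reference count of each GC-checked variable; variables
// whose count reaches zero are handed to the garbage collector, while
// persistable variables are skipped.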
void InterpreterCore::CheckGC(
    const Instruction& instr,
    std::vector<std::atomic<size_t>>* atomic_var_ref) {
  platform::RecordEvent record(
      "CheckGC", platform::TracerEventType::UserDefined, 10);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  RecordStreamForGC(instr);
#endif
  auto& var_scope = var_scope_;

  for (auto var_id : instr.GCCheckVars()) {
    VLOG(4) << "GC " << var_scope_.GetNameById(var_id) << " "
            << var_scope.VarDesc(var_id);
    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
            << " " << var_id;
    bool is_ready =
        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
    // ignore all persistable var while GC
    if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
      continue;
    }
    if (is_ready) {
      VLOG(6) << "Async delete variable with name : "
              << var_scope.GetNameById(var_id);
      gc_->Add(var_scope_.VarRef(var_id), instr);
    }
  }
}

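// Feed the input tensors into the (local) scope; on the first call, also
// build the variable scope and op func list and convert them into
// instructions.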
void InterpreterCore::Prepare(
    const std::vector<std::string>& feed_names,
    const std::vector<framework::LoDTensor>& feed_tensors,
    bool prepare_feed) {
  PADDLE_ENFORCE_EQ(feed_names.size(),
                    feed_tensors.size(),
                    platform::errors::PreconditionNotMet(
                        "Required feed_names.size() == feed_tensors.size(), "
                        "but received %d != %d",
                        feed_names.size(),
                        feed_tensors.size()));
  auto FeedInput = [&] {
    VLOG(4) << "Feed inputs";
    for (size_t i = 0; i < feed_names.size(); ++i) {
      auto* feed_var = local_scope_->FindVar(feed_names[i]);
      PADDLE_ENFORCE_NOT_NULL(
          feed_var,
          platform::errors::NotFound("Variable %s should not be nullptr.",
                                     feed_names[i]));

      auto feed_tensor = feed_var->GetMutable<framework::LoDTensor>();
      feed_tensor->ShareDataWith(feed_tensors[i]);
      feed_tensor->set_lod(feed_tensors[i].lod());
    }
  };

  if (!is_build_) {
    paddle::framework::interpreter::build_variable_scope(
        block_, &var_scope_, create_local_scope_);
    FeedInput();
    std::vector<paddle::framework::OpFuncNode> op_func_nodes;
    paddle::framework::interpreter::build_op_func_list(place_,
                                                       block_,
                                                       skip_gc_vars_,
                                                       &op_func_nodes,
                                                       &var_scope_,
                                                       create_local_scope_,
                                                       used_for_jit_);
    is_build_ = true;
    SetFeedVarsInplaceSkip(feed_names);
    // convert vec func_list to graph
    Convert(&op_func_nodes);
  }
  // NOTE: Because feed_tensor will be GC-ed after
  // paddle::framework::build_op_func_list, we should
  // call FeedInput again.
  if (prepare_feed) {
    FeedInput();
  }
}

void InterpreterCore::SetFeedVarsInplaceSkip(
    const std::vector<std::string>& feed_names) {
  for (auto& feed_name : feed_names) {
    var_scope_.SetVarSikpInplace(feed_name, true);
  }
}

std::shared_ptr<InterpreterCore> CreateInterpreterCore(
    const platform::Place& place,
    const ProgramDesc& prog,
    Scope* scope,
    const std::vector<std::string>& fetch_names,
    const std::set<std::string>& skip_gc_vars) {
  std::shared_ptr<InterpreterCore> core = nullptr;
  // NOTE(Aurelius84): `add_fetch` will modify BlockDesc, so we should copy
  // a new program.
  auto new_prog = std::make_shared<framework::ProgramDesc>(prog);
  auto* block = new_prog->MutableBlock(0);
  interpreter::add_fetch(fetch_names, block);

  core = std::make_shared<InterpreterCore>(place, *block, skip_gc_vars, scope);
  core->SetCopyProgram(new_prog);
  return core;
}

}  // namespace framework
}  // namespace paddle