naive_executor.cc 9.3 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

X
Xin Pan 已提交
15
#include "paddle/fluid/framework/naive_executor.h"
16

17
#include <string>
18 19
#include <unordered_map>
#include <unordered_set>
20

21
#include "paddle/fluid/framework/op_registry.h"
22
#include "paddle/fluid/framework/scope.h"
W
Wang Guibao 已提交
23
#include "paddle/fluid/framework/variable_helper.h"
24
#include "paddle/fluid/platform/denormal.h"
25 26 27
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
28
#ifdef PADDLE_WITH_TENSORRT
W
wenbin 已提交
29 30
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#endif
31 32 33
#ifdef PADDLE_WITH_INFERENCE_NVTX
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif
34 35 36

namespace paddle {
namespace framework {
37 38 39 40
// Binds the executor to a scope and instantiates the operators of one block.
//
// scope:               scope to execute in; when null a new framework::Scope
//                      is allocated and used instead.
// program_desc:        program whose block `block_id` supplies the operators.
// block_id:            index of the block forwarded to CreateOps.
// with_feed_fetch_ops: forwarded to CreateOps; when false, "feed"/"fetch"
//                      operators are skipped there.
void NaiveExecutor::Prepare(Scope *scope,
                            const ProgramDesc &program_desc,
                            int block_id,
                            bool with_feed_fetch_ops) {
  // NOTE(review): nothing in this file releases the scope allocated here;
  // presumably ownership is handled elsewhere -- confirm.
  scope_ = (scope != nullptr) ? scope : new framework::Scope;

  // The log prints the caller-supplied pointer, so it shows a null pointer
  // when the executor had to allocate its own scope.
  VLOG(3) << "NaiveExecutor init with scope " << scope;

  CreateOps(program_desc, block_id, with_feed_fetch_ops);
}

void NaiveExecutor::Run() {
52 53
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
J
Jacek Czaja 已提交
54
  platform::RegisterModelLayout(ops_, place_);
55
#endif
56
  platform::ScopedFlushDenormal flush;
57 58 59
#ifdef PADDLE_WITH_INFERENCE_NVTX
  platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
#endif
60
  for (auto &op : ops_) {
Y
Yan Chunwei 已提交
61 62
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;
63
    op->SetIsCalledByExecutor(false);
64
#ifdef PADDLE_WITH_INFERENCE_NVTX
65 66
    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
                                platform::NvtxRangeColor::Green);
67
#endif
68

69 70 71 72
    if (op->Type() == "while") {
      op->SetOutputHooks(hookfuncs_);
    }

73
    op->Run(*scope_, place_);
74 75 76 77 78 79 80

    // Update the shared_holder so that only records the max one.
    if (reuse_cache_.count(op.get())) {
      for (auto &it : reuse_cache_[op.get()]) {
        if (it.first->memory_size() >
            cluster_buffer_[it.second]->memory_size()) {
          cluster_buffer_[it.second] = it.first;
81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96
          int updated_cluster_id = it.second;

          // cluster_buffer_[it.second] has been updated to be a new
          // phi::DenseTensor*, we need change all phi::DenseTensor's
          // shared_holder in this cluster. The following two loops code looks
          // ugly, it does work. The following two loops seem time-consuming,
          // but once the memory reaches its peak, the cluster will not update,
          // so it's ok.
          for (auto &op_map : reuse_cache_) {
            // op_map.second is std::unordered_map<phi::DenseTensor*, int>.
            for (auto &it2 : op_map.second) {
              if (it2.second == updated_cluster_id) {
                it2.first->ShareBufferWith(*cluster_buffer_[it2.second], true);
              }
            }
          }
97 98 99 100
        }
      }
    }

101 102 103
#ifdef PADDLE_WITH_INFERENCE_NVTX
    platform::CudaNvtxRangePop();
#endif
104 105
    for (auto &func : hookfuncs_) {
      func(op.get(), scope_);
106
    }
107
  }
108 109 110
#ifdef PADDLE_WITH_INFERENCE_NVTX
  platform::CudaNvtxRangePop();
#endif
111 112
}

113 114 115 116
void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
                                    int block_id,
                                    bool persistable,
                                    Scope *scope) {
117 118 119
  PADDLE_ENFORCE_NOT_NULL(scope,
                          platform::errors::InvalidArgument(
                              "The Scope to hold variables is nullptr."));
120

121 122
  auto &global_block = desc.Block(block_id);

123
  const auto *anc = scope;
124
  PADDLE_ENFORCE_NE(
125 126
      anc->parent(),
      anc,
127
      platform::errors::InvalidArgument("Input scope should be child scope."));
128 129
  while (anc->parent()) {
    anc = anc->parent();
130 131
  }

Y
Yan Chunwei 已提交
132
  int num_vars = 0;
133 134 135 136
  for (auto &var : global_block.AllVars()) {
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }
Y
Yan Chunwei 已提交
137
    num_vars++;
138 139 140 141 142 143 144 145 146 147 148 149 150

    if (persistable == var->Persistable()) {
      if (persistable) {
        if (!anc->FindVar(var->Name())) {
          auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
          VLOG(3) << scope << " Create persistable variable " << var->Name()
                  << ", which pointer is " << ptr;
          InitializeVariable(ptr, var->GetType());
        }
      } else {
        auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
        VLOG(3) << scope << " Create variable " << var->Name()
                << ", which pointer is " << ptr;
151 152 153 154
        InitializeVariable(ptr, var->GetType());
      }
    }
  }
Y
Yan Chunwei 已提交
155
  VLOG(4) << "naive executor create " << num_vars << " vars";
156 157
}

158 159
void NaiveExecutor::CreateOps(const ProgramDesc &desc,
                              int block_id,
160 161 162 163
                              bool with_feed_fetch_ops) {
  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
    if (!with_feed_fetch_ops &&
        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
164 165
      LOG(INFO) << "---  skip [" << op_desc->Input("X")[0] << "], "
                << op_desc->Type() << " -> " << op_desc->Output("Out")[0];
166 167 168 169 170 171
      continue;
    }
    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
  }
}

172
// Looks up the variable `name` in the executor's scope and returns a mutable
// pointer to the phi::DenseTensor it holds.
//
// Raises PreconditionNotMet when no scope has been set, and NotFound when the
// scope has no variable with that name.
phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
  PADDLE_ENFORCE_NOT_NULL(scope_,
                          platform::errors::PreconditionNotMet(
                              "Need to init scope in NaiveExecutor firstly."));

  auto *variable = scope_->FindVar(name);
  PADDLE_ENFORCE_NOT_NULL(
      variable,
      platform::errors::NotFound("No variable [%s] in current scope.", name));

  // Get<>() hands back a const reference; callers of this method receive a
  // mutable pointer, hence the cast.
  const auto &held = variable->Get<phi::DenseTensor>();
  return const_cast<phi::DenseTensor *>(&held);
}

184
void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
185
  hookfuncs_.push_back(hookfunc);
186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224
}

void NaiveExecutor::MakeReusePlan(
    const std::unordered_map<std::string, std::string> &reuse_table) {
  std::unordered_map<std::string, std::unordered_set<std::string>> clusters;
  for (auto &it : reuse_table) {
    clusters[it.second].insert(it.first);
  }

  std::vector<std::string> cluster_names;
  for (auto &it : clusters) {
    cluster_names.push_back(it.first);
  }
  cluster_buffer_.resize(cluster_names.size());

  for (auto &op : ops_) {
    for (auto &name : op->OutputVars(true)) {
      if (reuse_table.count(name)) {
        const auto &reuse_name = reuse_table.at(name);
        auto it =
            std::find(cluster_names.begin(), cluster_names.end(), reuse_name);
        int idx = it - cluster_names.begin();
        auto *var = scope_->FindVar(name);
        auto *reuse_var = scope_->FindVar(reuse_name);
        if (var && reuse_var && var->IsType<phi::DenseTensor>() &&
            reuse_var->IsType<phi::DenseTensor>()) {
          auto *tensor = var->GetMutable<phi::DenseTensor>();
          auto *reuse_tensor = reuse_var->GetMutable<phi::DenseTensor>();
          cluster_buffer_[idx] = reuse_tensor;
          if (reuse_cache_.count(op.get())) {
            reuse_cache_[op.get()].emplace(tensor, idx);
          } else {
            reuse_cache_[op.get()] =
                std::unordered_map<phi::DenseTensor *, int>{{tensor, idx}};
          }
        }
      }
    }
  }
225 226
}

227 228 229 230
// Destructor. In MKLDNN builds it clears the MKLDNN cache entries associated
// with this executor and place; in other builds it does nothing.
NaiveExecutor::~NaiveExecutor() {
#ifdef PADDLE_WITH_MKLDNN
  // Required for the mkl-dnn unit tests to work.
  platform::ClearMKLDNNCache(place_, this);
#endif
}

W
wenbin 已提交
235
// Rebuilds the TensorRT engine of every "tensorrt_engine" operator whose
// engine is registered and uses dynamic shape. A no-op unless built with
// PADDLE_WITH_TENSORRT.
//
// num: value passed to the engine's SetProfileNum before it is re-prepared.
void NaiveExecutor::ResetTrtOps(int num) {
#ifdef PADDLE_WITH_TENSORRT
  for (auto &op : ops_) {
    if (op->Type() != "tensorrt_engine") {
      continue;
    }

    auto *trtop = dynamic_cast<operators::TensorRTEngineOp *>(op.get());
    // NOTE(review): a failed cast ends the whole reset, so any remaining
    // operators are not visited -- confirm this is intended.
    if (trtop == nullptr) return;

    // Engines are registered under "<engine_key><predictor_id>".
    const std::string key = trtop->Attr<std::string>("engine_key");
    const int predictor_id = trtop->Attr<int>("predictor_id");
    const std::string engine_name = key + std::to_string(predictor_id);

    // can't get trt engine if int8 calibration table data process.
    auto &engine_manager = paddle::inference::Singleton<
        inference::tensorrt::TRTEngineManager>::Global();
    operators::TensorRTEngine *trt_engine =
        engine_manager.Has(engine_name) ? engine_manager.Get(engine_name)
                                        : nullptr;
    if (trt_engine == nullptr || !trt_engine->with_dynamic_shape()) {
      continue;
    }

    LOG(INFO) << "rebuild trt engine, this may cost a lot of time!";
    trt_engine->ResetContext();
    trt_engine->ClearTensorMap();
    trt_engine->SetProfileNum(num);

    // Prepare against the top-most ancestor of scope_, or scope_ itself when
    // it has no parent.
    const Scope *root = scope_;
    while (root->parent()) {
      root = root->parent();
    }
    trtop->PrepareTRTEngine(*root, trt_engine);
  }
#endif
}
273

274 275
}  // namespace framework
}  // namespace paddle