// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/naive_executor.h"

#include <algorithm>
#include <string>
#include <thread>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/denormal.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
#ifdef PADDLE_WITH_TENSORRT
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
#endif
#ifdef PADDLE_WITH_INFERENCE_NVTX
#include "paddle/fluid/platform/device/gpu/cuda/cuda_profiler.h"
#endif

namespace paddle {
namespace framework {
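
// A rough usage sketch (hypothetical caller code; assumes `program` is a
// ProgramDesc produced by the inference pipeline and `place` is e.g. a
// platform::CPUPlace):
//
//   NaiveExecutor exe(place);
//   exe.Prepare(/*scope=*/nullptr, program, /*block_id=*/0,
//               /*with_feed_fetch_ops=*/false);
//   exe.CreateVariables(program, /*block_id=*/0, /*persistable=*/true,
//                       exe.GetScope());
//   exe.Run();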
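
// Prepare binds (or, when `scope` is null, creates) the scope that holds the
// model's variables, then instantiates the operators of the given block.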
void NaiveExecutor::Prepare(Scope *scope,
                            const ProgramDesc &program_desc,
                            int block_id,
                            bool with_feed_fetch_ops) {
  if (!scope) {
    scope_ = new framework::Scope;
  } else {
    scope_ = scope;
  }

  VLOG(3) << "NaiveExecutor init with scope " << scope_;
  CreateOps(program_desc, block_id, with_feed_fetch_ops);
}

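// Run executes the prepared operators in order on place_, keeping the
// buffer-reuse bookkeeping up to date and firing registered output hooks
// after every operator.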
void NaiveExecutor::Run() {
#ifdef PADDLE_WITH_MKLDNN
  platform::AttachPointerHashToMKLDNNKey(this, place_);
  platform::RegisterModelLayout(ops_, place_);
#endif
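  // Flush denormal floating-point values to zero while Run is on the stack,
  // avoiding slow denormal arithmetic paths.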
  platform::ScopedFlushDenormal flush;
#ifdef PADDLE_WITH_INFERENCE_NVTX
  platform::CudaNvtxRangePush("model", platform::NvtxRangeColor::Yellow);
#endif
  for (auto &op : ops_) {
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
#ifdef PADDLE_WITH_INFERENCE_NVTX
    platform::CudaNvtxRangePush(op->Type() + "|" + op->OutputVars(true).front(),
                                platform::NvtxRangeColor::Green);
#endif

    op->Run(*scope_, place_);

    // Update the shared holder so that it always records the tensor with the
    // largest memory footprint in each cluster.
    if (reuse_cache_.count(op.get())) {
      for (auto &it : reuse_cache_[op.get()]) {
        if (it.first->memory_size() >
            cluster_buffer_[it.second]->memory_size()) {
          cluster_buffer_[it.second] = it.first;
          int updated_cluster_id = it.second;

          // cluster_buffer_[it.second] now points to a new phi::DenseTensor*,
          // so every phi::DenseTensor in this cluster must re-share its
          // holder. The two nested loops below look costly, but once memory
          // usage reaches its peak the clusters stop updating, so the cost
          // is bounded in practice.
          for (auto &op_map : reuse_cache_) {
            // op_map.second is std::unordered_map<phi::DenseTensor*, int>.
            for (auto &it2 : op_map.second) {
              if (it2.second == updated_cluster_id) {
                it2.first->ShareBufferWith(*cluster_buffer_[it2.second], true);
              }
            }
          }
        }
      }
    }

#ifdef PADDLE_WITH_INFERENCE_NVTX
    platform::CudaNvtxRangePop();
#endif
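    // Invoke the hooks registered via RegisterOutputHook, once per operator.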
    for (auto &func : hookfunc_) {
      func(op.get());
    }
  }
#ifdef PADDLE_WITH_INFERENCE_NVTX
  platform::CudaNvtxRangePop();
#endif
}

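// CreateVariables instantiates the variables of the given block. Persistable
// variables (e.g. parameters) are created in the root ancestor scope so they
// can be shared, while temporaries are created in `scope` itself.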
void NaiveExecutor::CreateVariables(const ProgramDesc &desc,
                                    int block_id,
                                    bool persistable,
                                    Scope *scope) {
  PADDLE_ENFORCE_NOT_NULL(scope,
                          platform::errors::InvalidArgument(
                              "The Scope to hold variables is nullptr."));

  auto &global_block = desc.Block(block_id);

  const auto *anc = scope;
  PADDLE_ENFORCE_NE(
      anc->parent(),
      anc,
      platform::errors::InvalidArgument("Input scope should be child scope."));
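  // Walk up to the root scope; persistable variables are created there so
  // they outlive, and can be shared across, child scopes.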
  while (anc->parent()) {
    anc = anc->parent();
  }

  int num_vars = 0;
  for (auto &var : global_block.AllVars()) {
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }
    num_vars++;

    if (persistable == var->Persistable()) {
      if (persistable) {
        if (!anc->FindVar(var->Name())) {
          auto *ptr = const_cast<Scope *>(anc)->Var(var->Name());
          VLOG(3) << scope << " Create persistable variable " << var->Name()
                  << ", which pointer is " << ptr;
          InitializeVariable(ptr, var->GetType());
        }
      } else {
        auto *ptr = const_cast<Scope *>(scope)->Var(var->Name());
        VLOG(3) << scope << " Create variable " << var->Name()
                << ", which pointer is " << ptr;
        InitializeVariable(ptr, var->GetType());
      }
    }
  }
  VLOG(4) << "naive executor create " << num_vars << " vars";
}

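// CreateOps instantiates every operator of the block. Feed/fetch ops can be
// skipped, since inference usually binds inputs and outputs directly through
// the scope.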
void NaiveExecutor::CreateOps(const ProgramDesc &desc,
                              int block_id,
                              bool with_feed_fetch_ops) {
  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
    if (!with_feed_fetch_ops &&
        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
      LOG(INFO) << "---  skip [" << op_desc->Input("X")[0] << "], "
                << op_desc->Type() << " -> " << op_desc->Output("Out")[0];
      continue;
    }
    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
  }
}

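// FindTensor returns the mutable DenseTensor behind the named variable in
// the bound scope; callers use it both to feed inputs and to fetch outputs.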
phi::DenseTensor *NaiveExecutor::FindTensor(const std::string &name) {
  PADDLE_ENFORCE_NOT_NULL(scope_,
                          platform::errors::PreconditionNotMet(
                              "Need to init scope in NaiveExecutor firstly."));
  auto *var = scope_->FindVar(name);
  PADDLE_ENFORCE_NOT_NULL(
      var,
      platform::errors::NotFound("No variable [%s] in current scope.", name));
  auto *tensor = const_cast<phi::DenseTensor *>(&var->Get<phi::DenseTensor>());
  return tensor;
}

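// Hooks registered here are invoked by Run() right after each operator
// finishes.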
void NaiveExecutor::RegisterOutputHook(const HookFunc &hookfunc) {
  hookfunc_.push_back(hookfunc);
}

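// MakeReusePlan groups tensors into reuse clusters from reuse_table (tensor
// name -> representative name). For every op output that joins a cluster,
// reuse_cache_ records its (tensor, cluster index) pair, while
// cluster_buffer_ holds the tensor whose buffer the cluster currently
// shares.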
void NaiveExecutor::MakeReusePlan(
    const std::unordered_map<std::string, std::string> &reuse_table) {
  std::unordered_map<std::string, std::unordered_set<std::string>> clusters;
  for (auto &it : reuse_table) {
    clusters[it.second].insert(it.first);
  }

  std::vector<std::string> cluster_names;
  for (auto &it : clusters) {
    cluster_names.push_back(it.first);
  }
  cluster_buffer_.resize(cluster_names.size());

  for (auto &op : ops_) {
    for (auto &name : op->OutputVars(true)) {
      if (reuse_table.count(name)) {
        const auto &reuse_name = reuse_table.at(name);
        auto it =
            std::find(cluster_names.begin(), cluster_names.end(), reuse_name);
        int idx = it - cluster_names.begin();
        auto *var = scope_->FindVar(name);
        auto *reuse_var = scope_->FindVar(reuse_name);
        if (var && reuse_var && var->IsType<phi::DenseTensor>() &&
            reuse_var->IsType<phi::DenseTensor>()) {
          auto *tensor = var->GetMutable<phi::DenseTensor>();
          auto *reuse_tensor = reuse_var->GetMutable<phi::DenseTensor>();
          cluster_buffer_[idx] = reuse_tensor;
          if (reuse_cache_.count(op.get())) {
            reuse_cache_[op.get()].emplace(tensor, idx);
          } else {
            reuse_cache_[op.get()] =
                std::unordered_map<phi::DenseTensor *, int>{{tensor, idx}};
          }
        }
      }
    }
  }
}

NaiveExecutor::~NaiveExecutor() {
#ifdef PADDLE_WITH_MKLDNN
  // Clear the MKL-DNN cache; this is needed for the MKL-DNN unit tests to
  // work.
  platform::ClearMKLDNNCache(place_, this);
#endif
}

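// ResetTrtOps rebuilds every dynamic-shape TensorRT engine with `num`
// optimization profiles; engines are looked up in the global
// TRTEngineManager by engine_key plus predictor_id.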
void NaiveExecutor::ResetTrtOps(int num) {
#ifdef PADDLE_WITH_TENSORRT
  for (auto &op : ops_) {
    if (op->Type() == "tensorrt_engine") {
      operators::TensorRTEngineOp *trtop =
          dynamic_cast<operators::TensorRTEngineOp *>(op.get());
      if (!trtop) return;
      std::string engine_key = trtop->Attr<std::string>("engine_key");
      int engine_predictor_id = trtop->Attr<int>("predictor_id");
      std::string engine_name =
          engine_key + std::to_string(engine_predictor_id);
      operators::TensorRTEngine *trt_engine = nullptr;
      // The engine is not available while the int8 calibration table is
      // still being generated.
      if (paddle::inference::Singleton<
              inference::tensorrt::TRTEngineManager>::Global()
              .Has(engine_name)) {
        trt_engine = paddle::inference::Singleton<
                         inference::tensorrt::TRTEngineManager>::Global()
                         .Get(engine_name);
      }
      if (trt_engine && trt_engine->with_dynamic_shape()) {
        LOG(INFO) << "rebuild trt engine, this may cost a lot of time!";
        trt_engine->ResetContext();
        trt_engine->ClearTensorMap();
        trt_engine->SetProfileNum(num);
        auto *anc = scope_->parent();
        while (anc && anc->parent()) {
          anc = anc->parent();
        }
        if (anc == nullptr) {
          anc = scope_;
        }
        trtop->PrepareTRTEngine(*anc, trt_engine);
      }
    }
  }
#endif
}

}  // namespace framework
}  // namespace paddle