/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once

#ifdef PADDLE_WITH_CUDA

#include <memory>
#include <numeric>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"

namespace paddle {

namespace operators {

using inference::Singleton;
using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;

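// Checks, for engines built with static shapes, that an input's runtime
// shape matches the shape recorded in the model description, and aborts
// with a message listing both shapes otherwise.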
static void RuntimeStaticShapeCheck(std::vector<int64_t> runtime_input_shape,
                                    std::vector<int64_t> model_input_shape) {
  auto comma_fold = [](std::string a, int b) {
    return std::move(a) + ", " + std::to_string(b);
  };
  std::string model_input_shape_str = std::accumulate(
      std::next(model_input_shape.begin()), model_input_shape.end(),
      std::to_string(model_input_shape[0]), comma_fold);
  std::string runtime_input_shape_str = std::accumulate(
      std::next(runtime_input_shape.begin()), runtime_input_shape.end(),
      std::to_string(runtime_input_shape[0]), comma_fold);
  PADDLE_ENFORCE_EQ(
      model_input_shape == runtime_input_shape, true,
      platform::errors::InvalidArgument(
          "Input shapes are inconsistent with the model. Expect [%s] in "
          "model description, but got [%s] in runtime. TRT 5 or lower "
          "versions do not support dynamic input shapes. Please check and "
          "modify your input shapes.",
          model_input_shape_str, runtime_input_shape_str));
}

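// TensorRTEngineOp runs a fluid sub-graph with NVIDIA TensorRT. The
// sub-graph arrives through the "sub_block"/"subgraph" attributes, data
// inputs through "Xs" (names listed in the "parameters" attribute are
// weights that get baked into the engine), and outputs leave through "Ys".
// In INT8 calibration mode the op instead runs the sub-block natively and
// feeds every batch to a TRTInt8Calibrator to build a calibration table.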
class TensorRTEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> input_names_;
  std::unordered_set<std::string> param_names_;
  mutable TensorRTEngine *trt_engine_{nullptr};
  int max_batch_size_;
  int workspace_size_;
  std::unique_ptr<TRTInt8Calibrator> calibrator_;
  bool enable_int8_;
  bool enable_fp16_;
  bool use_calib_mode_;
  std::string calibration_data_;
  std::string engine_key_;
  bool calibration_mode_;
  int predictor_id_;
  int device_id_;
  AnalysisConfig::Precision precision_mode_;

 public:
  TensorRTEngineOp(const std::string &type,
                   const framework::VariableNameMap &inputs,
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
      : framework::OperatorBase(type, inputs, outputs, attrs) {
    input_names_ = Inputs("Xs");
    max_batch_size_ = Attr<int>("max_batch_size");
    workspace_size_ = Attr<int>("workspace_size");
    device_id_ = Attr<int>("gpu_id");
    enable_int8_ = Attr<bool>("enable_int8");
    enable_fp16_ = Attr<bool>("enable_fp16");
    use_calib_mode_ = Attr<bool>("use_calib_mode");
    calibration_data_ = Attr<std::string>("calibration_data");
    engine_key_ = Attr<std::string>("engine_key");
    predictor_id_ = Attr<int>("predictor_id");

    auto params = Attr<std::vector<std::string>>("parameters");
    for (const auto &param : params) {
      param_names_.insert(param);
    }
    // calibration_mode being true means we need to generate
    // the calibration table data.
    calibration_mode_ =
        (enable_int8_ && calibration_data_.empty() && use_calib_mode_);

    VLOG(4) << "calibration_mode: " << calibration_mode_;
    if (enable_int8_ && !calibration_data_.empty()) {
      calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
    }
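    // An engine may already have been created and cached under the
    // engine_key + predictor_id composite key; reuse it if so.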
    bool has_engine =
        inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
            .Has(engine_key_ + std::to_string(predictor_id_));

    if (!calibration_mode_ && has_engine) {
      trt_engine_ =
          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
              .Get(engine_key_ + std::to_string(predictor_id_));
    }
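    // Derive the engine precision from the int8/fp16 attributes
    // (fp16 wins if both are set).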
    precision_mode_ = AnalysisConfig::Precision::kFloat32;
    if (enable_int8_) {
      precision_mode_ = AnalysisConfig::Precision::kInt8;
    }
    if (enable_fp16_) {
      precision_mode_ = AnalysisConfig::Precision::kHalf;
    }
  }

 protected:
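  // Runs the original fluid sub-block with a plain Executor; used during
  // INT8 calibration, when no TRT engine exists yet.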
  void RunNativeImpl(const framework::Scope &scope,
                     const platform::Place &dev_place) const {
    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>("sub_block");
    auto *program = block->Program();
    auto &current_scope = scope.NewScope();
    auto ctx = executor.Prepare(*program, block->ID());
    executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
  }

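  // Dispatch: either collect calibration data or run the TRT engine.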
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    if (calibration_mode_) {
      RunCalibration(scope, dev_place);
      return;
    }
    auto *trt_engine = GetEngine(scope, dev_place);
    RunTrt(scope, dev_place, trt_engine);
  }

  void RunCalibration(const framework::Scope &scope,
                      const platform::Place &dev_place) const {
    // This process builds a 32-bit trt engine, runs it on the calibration
    // set, and records, for each tensor, a histogram of the distribution
    // of activation values.
    LOG_FIRST_N(INFO, 1) << "This process is generating the calibration table "
                            "for Paddle TRT int8...";

    int runtime_batch = 1;
    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
      TRTCalibratorEngine *calib_res =
          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
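      // Record each data input's buffer size so the calibrator knows how
      // much memory to stage per batch; the runtime batch size is taken
      // from the first dimension of the inputs.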
      std::unordered_map<std::string, size_t> calib_buffers;
      for (auto &x : input_names_) {
        if (param_names_.count(x)) continue;
        auto &t =
            inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
        calib_buffers[x] = t.memory_size();
        auto t_shape = framework::vectorize(t.dims());
        runtime_batch = t_shape[0];
      }
      calib_res->calib_.reset(new TRTInt8Calibrator(
          calib_buffers, runtime_batch, engine_key_, dev_place));
      calib_res->thr_.reset(new std::thread([&]() {
        calib_res->engine_.reset(new TensorRTEngine(
            max_batch_size_, workspace_size_, precision_mode_,
            calib_res->calib_.get(),
            boost::get<platform::CUDAPlace>(dev_place).device));
        VLOG(3) << "start the calib trt engine thread";
        PrepareTRTEngine(scope, calib_res->engine_.get());
      }));
    }

    TRTInt8Calibrator *temp_calibrator =
        Singleton<TRTCalibratorEngineManager>::Global()
            .Get(engine_key_)
            ->calib_.get();
    std::unordered_map<std::string, void *> calib_data;

    for (auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      calib_data.emplace(x, t.data<void>());
    }
    temp_calibrator->setBatch(calib_data);
    RunNativeImpl(scope, dev_place);
  }

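  // Binds the op's input/output tensors to the engine's binding slots and
  // enqueues execution on the CUDA stream of the current device context.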
  void RunTrt(const framework::Scope &scope, const platform::Place &dev_place,
              TensorRTEngine *engine) const {
    int runtime_batch = 1;
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();

    PADDLE_ENFORCE_EQ(input_names_.empty(), false,
                      "should pass at least one input");

    std::vector<std::string> output_maps =
        Attr<std::vector<std::string>>("output_name_mapping");

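    // Only data inputs get TRT bindings; weights named in "parameters"
    // were folded into the engine when it was built.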
    int num_inputs = 0;

    for (const auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      num_inputs += 1;
    }
    const int num_bindings = num_inputs + Outputs("Ys").size();
    std::vector<void *> buffers(num_bindings);

    // Bind input tensor to TRT.
    for (const auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      // convert input and copy to TRT engine's buffer
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      auto t_shape = framework::vectorize<int64_t>(t.dims());
      runtime_batch = t_shape[0];
      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
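      // A static-shape engine can only validate the recorded shape, while a
      // dynamic-shape engine (TRT >= 6) sets binding dimensions at run time.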
      if (!engine->with_dynamic_shape()) {
        // check if the input shapes are consistent with model.
        if (HasAttr(x + "_shape")) {
          std::vector<int64_t> i_shape =
              Attr<std::vector<int64_t>>(x + "_shape");
          std::vector<int64_t> model_input_shape(i_shape.begin() + 1,
                                                 i_shape.end());
          std::vector<int64_t> runtime_input_shape(t_shape.begin() + 1,
                                                   t_shape.end());
          RuntimeStaticShapeCheck(runtime_input_shape, model_input_shape);
        }
      } else {
#if IS_TRT_VERSION_GE(6000)
        auto *trt_context = engine->context();
        trt_context->setBindingDimensions(
            bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true));
#endif
      }
      buffers[bind_index] = static_cast<void *>(t.data<float>());
    }

    // Bind output tensor to TRT.
    int output_index = 0;
    VLOG(4) << "TensorRT Engine Op Outputs:";
    for (const auto &y : Outputs("Ys")) {
      const int bind_index =
          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
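      // With static shape, TRT reports output dims without the batch
      // dimension, so prepend runtime_batch; with dynamic shape the
      // execution context reports the complete dims.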
      std::vector<int> ddim;

      if (!engine->with_dynamic_shape()) {
        auto dims = engine->engine()->getBindingDimensions(bind_index);
        ddim.push_back(runtime_batch);
        for (int i = 0; i < dims.nbDims; i++) {
          ddim.push_back(dims.d[i]);
        }
      } else {
#if IS_TRT_VERSION_GE(6000)
        auto *trt_context = engine->context();
        auto dims = trt_context->getBindingDimensions(bind_index);
        for (int i = 0; i < dims.nbDims; i++) ddim.push_back(dims.d[i]);
#endif
      }
      auto *fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));

      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
          boost::get<platform::CUDAPlace>(dev_place)));

      output_index += 1;
    }

    PADDLE_ENFORCE_LE(
        runtime_batch, max_batch_size_,
        platform::errors::InvalidArgument(
            "The runtime batch size (%d) is greater than the max batch "
            "size (%d).\n"
            "There are two possible causes for this problem: \n"
            "1. Check whether the runtime batch is larger than the max_batch "
            "set by EnableTensorrtEngine()\n"
            "2. Check whether the model you are running has multiple trt "
            "subgraphs: \n "
            "\tIf there are multiple trt subgraphs, you need to ensure that "
            "the first dimension of the input tensor of these subgraphs is "
            "consistent.\n"
            "\tIf there are inconsistent subgraphs, you need to filter them "
            "by setting min_subgraph_size using the EnableTensorrtEngine "
            "interface.\n"
            "\tThe min_subgraph_size should be greater than the number of "
            "nodes in the inconsistent subgraph.\n",
            runtime_batch, max_batch_size_));
    // Execute the engine.
    engine->Execute(runtime_batch, &buffers, stream);
  }

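  // Lazily builds the engine on first use and registers it with the global
  // TRTEngineManager so subsequent runs can reuse it.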
  TensorRTEngine *GetEngine(const framework::Scope &scope,
                            const platform::Place &dev_place) const {
    if (!trt_engine_) {
      trt_engine_ =
          inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
              .Create(engine_key_ + std::to_string(predictor_id_),
                      max_batch_size_, workspace_size_, precision_mode_,
                      calibrator_.get(), device_id_);
      PrepareTRTEngine(scope, trt_engine_);
    }
    return trt_engine_;
  }

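  // Deserializes the "subgraph" attribute back into a BlockDesc and
  // converts it into a TensorRT network inside the given engine.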
  void PrepareTRTEngine(const framework::Scope &scope,
                        TensorRTEngine *engine) const {
    LOG(INFO) << "Preparing TRT engine (optimizing model structure, selecting "
                 "OP kernels, etc). This process may take a long time.";
    framework::proto::BlockDesc block_proto;
    block_proto.ParseFromString(Attr<std::string>("subgraph"));
    framework::BlockDesc block_desc(nullptr, &block_proto);

    std::vector<std::string> inputs = Inputs("Xs");
    std::vector<std::string> outputs =
        Attr<std::vector<std::string>>("output_name_mapping");

    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
                                 outputs, engine);
  }
};

}  // namespace operators
}  // namespace paddle

#endif  // PADDLE_WITH_CUDA