/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */

#pragma once

#ifdef PADDLE_WITH_CUDA

#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "paddle/fluid/framework/executor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/engine.h"

namespace paddle {

namespace operators {

using inference::Singleton;
using inference::tensorrt::TensorRTEngine;
using inference::tensorrt::TRTInt8Calibrator;
using inference::tensorrt::TRTCalibratorEngine;
using inference::tensorrt::TRTCalibratorEngineManager;

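// TensorRTEngineOp runs a fluid sub-block with TensorRT. It has two
// execution paths: in INT8 calibration mode (enable_int8 is set but no
// calibration_data exists yet) it runs the sub-block natively while feeding
// inputs to a TRTInt8Calibrator; otherwise it builds (or deserializes) a
// TensorRT engine and executes it directly.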
class TensorRTEngineOp : public framework::OperatorBase {
 private:
  std::vector<std::string> input_names_;
  std::unordered_set<std::string> param_names_;
  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
  int max_batch_size_;
  int workspace_size_;
  std::unique_ptr<TRTInt8Calibrator> calibrator_;
  bool enable_int8_;
  std::string calibration_data_;
  std::string engine_key_;
  std::string engine_serialized_data_;
  bool calibration_mode_;

 public:
  TensorRTEngineOp(const std::string &type,
                   const framework::VariableNameMap &inputs,
                   const framework::VariableNameMap &outputs,
                   const framework::AttributeMap &attrs)
      : framework::OperatorBase(type, inputs, outputs, attrs) {
    input_names_ = Inputs("Xs");
    max_batch_size_ = Attr<int>("max_batch_size");
    workspace_size_ = Attr<int>("workspace_size");
    enable_int8_ = Attr<bool>("enable_int8");
    calibration_data_ = Attr<std::string>("calibration_data");
    engine_key_ = Attr<std::string>("engine_key");
    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");

    auto params = Attr<std::vector<std::string>>("parameters");
    for (const auto &param : params) {
      param_names_.insert(param);
    }
    // calibration_mode being true means we need to generate
    // the calibration table data.
    calibration_mode_ = (enable_int8_ && calibration_data_.empty());

    VLOG(4) << "calibration_mode: " << calibration_mode_;
    if (enable_int8_ && !calibration_data_.empty()) {
      calibrator_.reset(new TRTInt8Calibrator(calibration_data_));
    }
  }

 protected:
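  // Runs the original fluid sub-block with the native executor, so the op
  // still produces its outputs while calibration data is being collected.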
  void RunNativeImpl(const framework::Scope &scope,
                     const platform::Place &dev_place) const {
    framework::Executor executor(dev_place);
    auto *block = Attr<framework::BlockDesc *>("sub_block");
    auto *program = block->Program();
    auto &current_scope = scope.NewScope();
    auto ctx = executor.Prepare(*program, block->ID());
    executor.RunPreparedContext(ctx.get(), &current_scope, false, true, true);
  }

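  // Entry point: dispatch to the calibration path, or build/fetch the TRT
  // engine and run it.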
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
    if (calibration_mode_) {
      RunCalibration(scope, dev_place);
      return;
    }
    auto *trt_engine = GetEngine(scope, dev_place);
    RunTrt(scope, dev_place, trt_engine);
  }

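  // Generates the INT8 calibration table data for this engine.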
  void RunCalibration(const framework::Scope &scope,
                      const platform::Place &dev_place) const {
    // This process builds an FP32 TRT engine, runs it on the calibration
    // set, and records a histogram of the distribution of activation
    // values for each tensor.
    LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
                         << " is running INT8 calibration...";
    int runtime_batch = 1;
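    // On the first run, register a calibrator sized from the non-parameter
    // inputs and build the engine in a background thread; the build waits
    // inside the calibrator for batches delivered via setBatch().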
    if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
      TRTCalibratorEngine *calib_res =
          Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
      std::unordered_map<std::string, size_t> calib_buffers;
      for (auto &x : input_names_) {
        if (param_names_.count(x)) continue;
        auto &t =
            inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
        calib_buffers[x] = t.memory_size();
        auto t_shape = framework::vectorize(t.dims());
        runtime_batch = t_shape[0];
      }
      calib_res->calib_.reset(new TRTInt8Calibrator(
          calib_buffers, runtime_batch, engine_key_, dev_place));
      calib_res->thr_.reset(new std::thread([&]() {
        calib_res->engine_.reset(new TensorRTEngine(
            max_batch_size_, workspace_size_, enable_int8_,
            calib_res->calib_.get(),
            boost::get<platform::CUDAPlace>(dev_place).device));
        VLOG(3) << "start the calib trt engine thread";
        PrepareTRTEngine(scope, calib_res->engine_.get());
      }));
    }

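    // Hand the current batch's input pointers to the calibrator; the engine
    // build thread consumes them.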
    TRTInt8Calibrator *temp_calibrator =
        Singleton<TRTCalibratorEngineManager>::Global()
            .Get(engine_key_)
            ->calib_.get();
    std::unordered_map<std::string, void *> calib_data;

    for (auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      calib_data.emplace(x, t.data<void>());
    }
    temp_calibrator->setBatch(calib_data);
    RunNativeImpl(scope, dev_place);
  }

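  // Executes the prepared TRT engine: binds input/output buffers by name,
  // resizes the fluid output tensors from the engine's binding dimensions,
  // and enqueues the run on the device's CUDA stream.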
  void RunTrt(const framework::Scope &scope, const platform::Place &dev_place,
              TensorRTEngine *engine) const {
    int runtime_batch = 1;
    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
    auto &dev_ctx = *pool.Get(dev_place);
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();

    PADDLE_ENFORCE(!input_names_.empty(), "should pass at least one input");

    std::vector<std::string> output_maps =
        Attr<std::vector<std::string>>("output_name_mapping");

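    // TensorRT addresses tensors through binding slots; reserve one slot per
    // non-parameter input and one per output.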
    int num_inputs = 0;

    for (const auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      num_inputs += 1;
    }
    const int num_bindings = num_inputs + Outputs("Ys").size();
    std::vector<void *> buffers(num_bindings);

    // Bind input tensor to TRT.
    for (const auto &x : Inputs("Xs")) {
      if (param_names_.count(x)) continue;
      // Convert the input and copy it to the TRT engine's buffer.
      auto &t =
          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
      auto t_shape = framework::vectorize(t.dims());
      runtime_batch = t_shape[0];

      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void *>(t.data<float>());
    }

    // Bind output tensor to TRT.
    int output_index = 0;
    VLOG(4) << "TensorRT Engine Op Outputs:";
    for (const auto &y : Outputs("Ys")) {
      const int bind_index =
          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
      auto dims = engine->engine()->getBindingDimensions(bind_index);
      // Use the output ITensor's dims to reshape the Fluid Tensor.
      // The ITensor doesn't contain the batch size dim.
      std::vector<int> ddim;
      ddim.push_back(runtime_batch);
      for (int i = 0; i < dims.nbDims; i++) {
        ddim.push_back(dims.d[i]);
      }
      auto *fluid_v = scope.FindVar(y);
      PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
      fluid_t->Resize(framework::make_ddim(ddim));

      PADDLE_ENFORCE(bind_index < num_bindings,
                     "The bind index should be less than num_bindings");
      buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
          boost::get<platform::CUDAPlace>(dev_place)));

      output_index += 1;
    }

    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
    // Execute the engine.
    engine->Execute(runtime_batch, &buffers, stream);
    cudaStreamSynchronize(stream);
  }

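  // Lazily creates the engine on first use: deserializes a prebuilt plan if
  // engine_serialized_data is present, otherwise converts the sub-block from
  // scratch.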
  TensorRTEngine *GetEngine(const framework::Scope &scope,
                            const platform::Place &dev_place) const {
    if (!trt_engine_) {
      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
          boost::get<platform::CUDAPlace>(dev_place).device));
      if (!engine_serialized_data_.empty()) {
        trt_engine_->Deserialize(engine_serialized_data_);
      } else {
        PrepareTRTEngine(scope, trt_engine_.get());
      }
    }
    return trt_engine_.get();
  }

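  // Converts the serialized sub-block (the "subgraph" attribute) into a
  // TensorRT network via the OpConverter singleton.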
  void PrepareTRTEngine(const framework::Scope &scope,
                        TensorRTEngine *engine) const {
    LOG(INFO) << "Preparing TRT engine (optimizing model structure, selecting "
                 "OP kernels, etc.). This process may take a long time.";
    framework::proto::BlockDesc block_proto;
    block_proto.ParseFromString(Attr<std::string>("subgraph"));
    framework::BlockDesc block_desc(nullptr, &block_proto);

    std::vector<std::string> inputs = Inputs("Xs");
    std::vector<std::string> outputs =
        Attr<std::vector<std::string>>("output_name_mapping");

    inference::Singleton<inference::tensorrt::OpConverter>::Global()
        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
                                 outputs, engine);
  }
};

}  // namespace operators
}  // namespace paddle

#endif  // PADDLE_WITH_CUDA