/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/engine.h"

#include <NvInfer.h>
#include <glog/logging.h>
19

A
Abhinav Arora 已提交
20
#include <string>
W
wanghuancoder 已提交
21

22
#include "cuda_runtime_api.h"  // NOLINT
Y
Yan Chunwei 已提交
23
#include "paddle/fluid/inference/tensorrt/helper.h"
24
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
Y
Yan Chunwei 已提交
25 26 27 28 29 30
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

int TensorRTEngine::runtime_batch_ = 1;

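// Creates the TensorRT builder, the network definition (explicit-batch when
// dynamic shape is enabled, implicit-batch otherwise), the builder config and
// one optimization profile per profile slot requested by max_profile_num_.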
void TensorRTEngine::InitNetwork() {
  freshDeviceId();
  infer_builder_.reset(createInferBuilder(&logger_));

  if (with_dynamic_shape_) {
    infer_network_.reset(infer_builder_->createNetworkV2(
        1U << static_cast<int>(
            nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)));
  } else {
    infer_network_.reset(infer_builder_->createNetworkV2(0U));
  }

  infer_builder_config_.reset(infer_builder_->createBuilderConfig());
  // optim_profile_ = infer_builder_->createOptimizationProfile();
  optim_profiles_.resize(max_profile_num_);
  for (int i = 0; i < max_profile_num_; i++)
    optim_profiles_[i] = infer_builder_->createOptimizationProfile();
}

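// Runs inference on the built engine: static-shape mode goes through
// enqueue() with the batch size, dynamic-shape mode through enqueueV2()
// (the batch is carried by the bound input dimensions). A minimal usage
// sketch, assuming the caller has already filled `buffers` with device
// pointers in binding order:
//
//   engine->InitNetwork();
//   ... declare inputs, convert ops, declare outputs ...
//   engine->FreezeNetwork();
//   engine->Execute(batch_size, &buffers, stream);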
void TensorRTEngine::Execute(int batch_size,
                             std::vector<void *> *buffers,
                             cudaStream_t stream) {
  freshDeviceId();
  auto infer_context = context();
  if (!with_dynamic_shape()) {
    infer_context->enqueue(batch_size, buffers->data(), stream, nullptr);
  } else {
#if IS_TRT_VERSION_GE(6000)
    infer_context->enqueueV2(buffers->data(), stream, nullptr);
#endif
  }
  SetRuntimeBatch(batch_size);
}

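// Finalizes the network and builds the TensorRT engine: applies the
// max-batch/workspace limits, the FP16/INT8 flags (with per-tensor dynamic
// ranges when no calibrator is given), optional DLA placement and the
// dynamic-shape optimization profiles, then builds the engine
// (buildSerializedNetwork + deserialize on TRT 8+, buildEngineWithConfig
// otherwise).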
void TensorRTEngine::FreezeNetwork() {
  freshDeviceId();
  VLOG(3) << "TRT to freeze network";
  PADDLE_ENFORCE_NOT_NULL(infer_builder_,
                          platform::errors::InvalidArgument(
                              "Inference builder of TRT is null. Please make "
                              "sure you call InitNetwork first."));
  PADDLE_ENFORCE_NOT_NULL(network(),
                          platform::errors::InvalidArgument(
                              "Call InitNetwork first to initialize network."));
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_config_->setMaxWorkspaceSize(max_workspace_);

  bool enable_fp16 = (precision_ == AnalysisConfig::Precision::kHalf);
  if (enable_fp16) {
    bool support_fp16 = infer_builder_->platformHasFastFp16();
    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    if (!support_fp16) {
      LOG(INFO) << "You specified FP16 mode, but the hardware does not "
                   "support FP16 speed up, so FP32 is used instead.";
    } else {
      LOG(INFO) << "Run Paddle-TRT FP16 mode";
    }
  }

  bool enable_int8 = (precision_ == AnalysisConfig::Precision::kInt8);
  if (enable_int8) {
    if (!use_dla_) {
      infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16);
    }
    infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8);

    if (calibrator_) {
      infer_builder_config_->setInt8Calibrator(calibrator_);
    } else {
      infer_builder_config_->setInt8Calibrator(nullptr);

#if IS_TRT_VERSION_GE(5000)
      for (auto &quant_range : quant_dynamic_range_) {
        auto tensor = quant_range.first;
        float range = quant_range.second;
        tensor->setDynamicRange(-range, range);
      }

      std::unordered_set<nvinfer1::ITensor *> all_t;
      for (int i = 0; i < network()->getNbLayers(); i++) {
        auto layer = network()->getLayer(i);
        for (int j = 0; j < layer->getNbOutputs(); j++) {
          all_t.insert(layer->getOutput(j));
        }
      }

      for (int i = 0; i < network()->getNbInputs(); i++) {
        all_t.insert(network()->getInput(i));
      }

      for (auto &t : all_t) {
        if (!quant_dynamic_range_.count(t)) {
          VLOG(3) << "We are in trt int8 mode (not calibration), scale not set"
                  << " for tensor " << t->getName()
                  << ", this might be ok when trt does not need this range";
        }
      }

#if IS_TRT_VERSION_GE(5122)
      auto layer_int8_fallback = [&](nvinfer1::ILayer *layer) -> bool {
        if (layer->getType() == nvinfer1::LayerType::kSHAPE) {
          return false;
        }
        bool all_int = true;
        for (int j = 0; j < layer->getNbInputs(); j++) {
          auto *temp_in = layer->getInput(j);
          if (temp_in->getType() != nvinfer1::DataType::kINT32) {
            all_int = false;
          }
        }
        for (int j = 0; j < layer->getNbOutputs(); j++) {
          auto *temp_out = layer->getOutput(j);
          if (temp_out->getType() != nvinfer1::DataType::kINT32) {
            all_int = false;
          }
        }
        if (all_int) return false;

        for (int j = 0; j < layer->getNbInputs(); j++) {
          auto *temp_in = layer->getInput(j);
          if (!temp_in->dynamicRangeIsSet()) {
            VLOG(1) << "Layer(Name: " << layer->getName()
                    << ") is set to float32 because its input("
                    << temp_in->getName() << ") doesn't have dynamic range.";
            return true;
          }
        }
        for (int j = 0; j < layer->getNbOutputs(); j++) {
          auto *temp_out = layer->getOutput(j);
          if (!temp_out->dynamicRangeIsSet()) {
            VLOG(1) << "Layer(Name: " << layer->getName()
                    << ") is set to float32 because its output("
                    << temp_out->getName() << ") doesn't have dynamic range.";
            return true;
          }
        }
        return false;
      };
      // If a layer's output is the network's output, or not all of its inputs
      // and outputs have scales, this layer's precision and output type are
      // set to float32. This step has no effect if this layer is fused during
      // TRT optimization.
      int layers_no_int8 = 0;
      for (int i = 0; i < network()->getNbLayers(); i++) {
        auto layer = network()->getLayer(i);
        if (layer_int8_fallback(layer)) {
          layer->setPrecision(nvinfer1::DataType::kFLOAT);
          ++layers_no_int8;
        }
      }
      // Disable int8 if no layer can run in int8; otherwise the engine build
      // would fail.
      if (layers_no_int8 == network()->getNbLayers()) {
        nvinfer1::BuilderFlags flags = infer_builder_config_->getFlags();
        flags = flags & ~(1U << static_cast<int>(nvinfer1::BuilderFlag::kINT8));
        // reset flags
        infer_builder_config_->setFlags(flags);
      }
#else
      LOG(WARNING) << "If your TensorRT version is lower than 5.1.2.2, you "
                      "must provide quantization scales for all tensors used "
                      "by TRT in order to run.";
#endif
#endif
    }
  }

  if (use_dla_) {
    if (!enable_int8 && !enable_fp16) {
      LOG(WARNING) << "TensorRT DLA must be used with int8 or fp16, but you "
                      "set float32, so DLA is not used.";
    } else if (infer_builder_->getNbDLACores() == 0) {
      LOG(WARNING)
          << "TensorRT DLA is set by config, but your device does not have "
             "DLA, so DLA is not used.";
    } else {
      if (dla_core_ < 0 || dla_core_ >= infer_builder_->getNbDLACores()) {
        LOG(WARNING) << "Invalid DLACore, must be 0 <= DLACore < "
                     << infer_builder_->getNbDLACores() << ", but got "
                     << dla_core_ << ", so use 0 as default.";
        dla_core_ = 0;
      }
      infer_builder_config_->setDefaultDeviceType(nvinfer1::DeviceType::kDLA);
      infer_builder_config_->setDLACore(dla_core_);
      infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kGPU_FALLBACK);
      LOG(INFO) << "TensorRT DLA enabled in FreezeNetwork(), DLACore "
                << dla_core_;
    }
  }

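  // With dynamic shape enabled, register one optimization profile per profile
  // slot, filling each with the user-provided min/max/opt shapes for every
  // input tensor.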
  if (with_dynamic_shape_) {
#if IS_TRT_VERSION_GE(6000)
    LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode.";
    for (int i = 0; i < max_profile_num_; i++) {
      for (auto &input : min_input_shape_) {
#if IS_TRT_VERSION_LT(7000)
        // trt6 will check all_of input > 0
        if (!(std::all_of(input.second.begin(),
                          input.second.end(),
                          [](int x) { return x > 0; }) &&
              std::all_of(max_input_shape_[input.first].begin(),
                          max_input_shape_[input.first].end(),
                          [](int x) { return x > 0; }) &&
              std::all_of(optim_input_shape_[input.first].begin(),
                          optim_input_shape_[input.first].end(),
                          [](int x) { return x > 0; }))) {
          continue;
        }
#endif
        VLOG(4) << "TRT dynamic_shape set " << input.first
                << " min: " << Vec2Str(input.second)
                << ", max: " << Vec2Str(max_input_shape_[input.first])
                << ", opt: " << Vec2Str(optim_input_shape_[input.first]);

        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
            nvinfer1::OptProfileSelector::kMIN,
            Vec2TRT_Dims(input.second, input.first, true));
        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
            nvinfer1::OptProfileSelector::kMAX,
            Vec2TRT_Dims(max_input_shape_[input.first], input.first, true));
        optim_profiles_[i]->setDimensions(
            input.first.c_str(),
            nvinfer1::OptProfileSelector::kOPT,
            Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true));
      }
      infer_builder_config_->addOptimizationProfile(optim_profiles_[i]);
    }
    if (WithFp16() && disable_trt_plugin_fp16()) {
      LOG(INFO) << "NOTE: In order to achieve higher accuracy, you have "
                   "disabled the fp16 mode of TRT Plugin,\n"
                << "you can re-enable it with "
                   "'config.SetDynamicShapeInfo(min_shape, max_shape, "
                   "opt_shape, false /*disable_trt_plugin_fp16*/)'";
    }
#endif
  }
#if IS_TRT_VERSION_GE(8200)
  if (use_inspector_) {
    infer_builder_config_->setProfilingVerbosity(
        nvinfer1::ProfilingVerbosity::kDETAILED);
  }
#endif

#if IS_TRT_VERSION_LT(8000)
  infer_engine_.reset(infer_builder_->buildEngineWithConfig(
      *network(), *infer_builder_config_));
#else
  infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS);
  ihost_memory_.reset(infer_builder_->buildSerializedNetwork(
      *network(), *infer_builder_config_));
  infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
  infer_engine_.reset(runtime->deserializeCudaEngine(ihost_memory_->data(),
                                                     ihost_memory_->size()));
#endif

  PADDLE_ENFORCE_NOT_NULL(
      infer_engine_,
      platform::errors::Fatal(
          "Build TensorRT cuda engine failed! Please recheck "
          "your configurations related to paddle-TensorRT."));

  binding_num_ = infer_engine_->getNbBindings();
  // reset status for dynamic shape clone
  if (max_profile_num_ > 1) {
    infer_context_.clear();
    cur_profile_num_ = 0;
  }

  GetEngineInfo();
}

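// Declares a network input with the given data type and dimensions and
// registers it in the ITensor map under `name`.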
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(network() != nullptr,
                    true,
                    platform::errors::InvalidArgument(
                        "The TRT network should be initialized first."));
  auto *input = network()->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE_NOT_NULL(
      input,
      platform::errors::InvalidArgument("Adding input %s failed in "
                                        "TensorRT inference network. "
                                        "Please recheck your input.",
                                        name));
  PADDLE_ENFORCE_EQ(input->isNetworkInput(),
                    true,
                    platform::errors::InvalidArgument(
                        "Input %s is not the input of TRT inference network. "
                        "Please recheck your input.",
                        name));
  TensorRTEngine::SetITensor(name, input);
  return input;
}

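// Marks the offset-th output of `layer` as a network output named `name`;
// the overload below looks the tensor up by name instead.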
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer,
                                   int offset,
                                   const std::string &name) {
  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE_NOT_NULL(
      output,
      platform::errors::InvalidArgument(
          "The output %s of TRT engine should not be null.", name));
  output->setName(name.c_str());
  PADDLE_ENFORCE_EQ(output->isNetworkInput(),
                    false,
                    platform::errors::InvalidArgument(
                        "The output %s of TRT engine should not be the input "
                        "of the network at the same time.",
                        name));
  network()->markOutput(*output);
  PADDLE_ENFORCE_EQ(
      output->isNetworkOutput(),
      true,
      platform::errors::InvalidArgument(
          "The output %s of TRT engine should be the output of the network.",
          name));
}

void TensorRTEngine::DeclareOutput(const std::string &name) {
  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE_NOT_NULL(
      output,
      platform::errors::InvalidArgument(
          "The output %s of TRT engine should not be null.", name));
  output->setName(name.c_str());
  PADDLE_ENFORCE_EQ(output->isNetworkInput(),
                    false,
                    platform::errors::InvalidArgument(
                        "The output %s of TRT engine should not be the input "
                        "of the network at the same time.",
                        name));
  network()->markOutput(*output);
}

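// SetITensor/GetITensor maintain the name -> ITensor map that op converters
// use to wire layers together; names must be unique and must exist on lookup.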
void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE_NOT_NULL(
      tensor,
      platform::errors::InvalidArgument(
          "Tensor named %s of TRT engine should not be null.", name));
  PADDLE_ENFORCE_EQ(
      0,
      itensor_map_.count(name),
      platform::errors::InvalidArgument(
          "Tensor named %s of TRT engine should not be duplicated", name));
  itensor_map_[name] = tensor;
}

nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE_EQ(itensor_map_.count(name),
                    true,
                    platform::errors::NotFound(
                        "Tensor named %s is not found in TRT engine", name));
  return itensor_map_[name];
}

std::unordered_map<std::string, nvinfer1::ITensor *>
    *TensorRTEngine::GetITensorMap() {
  return &itensor_map_;
}

void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
  runtime_batch_ = batch_size;
}

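// Copies a weight tensor to a CPU tensor owned by the engine (stored via
// SetWeights), so the returned raw pointer stays valid while the engine holds
// the copy.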
template <typename T = float>
T *TensorRTEngine::GetWeightCPUData(const std::string &name,
                                    framework::Tensor *weight_tensor) {
  std::unique_ptr<framework::Tensor> cpu_weight_tensor(new framework::Tensor());
  platform::CPUPlace cpu_place;
  cpu_weight_tensor->Resize(weight_tensor->dims());
  paddle::framework::TensorCopySync(
      *weight_tensor, cpu_place, cpu_weight_tensor.get());
  T *weight_data = cpu_weight_tensor->mutable_data<T>(cpu_place);
  SetWeights(name, std::move(cpu_weight_tensor));
  return weight_data;
}

template float *TensorRTEngine::GetWeightCPUData(
    const std::string &name, framework::Tensor *weight_tensor);
template int32_t *TensorRTEngine::GetWeightCPUData(
    const std::string &name, framework::Tensor *weight_tensor);

template int64_t *TensorRTEngine::GetWeightCPUData(
    const std::string &name, framework::Tensor *weight_tensor);

int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

nvinfer1::IPluginV2Layer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs,
    int num_inputs,
    plugin::PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return network()->addPluginV2(inputs, num_inputs, *plugin);
}

nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext(
    nvinfer1::ITensor *const *inputs,
    int num_inputs,
    plugin::PluginTensorRTV2Ext *plugin) {
  owned_plugin_v2ext_.emplace_back(plugin);
  return network()->addPluginV2(inputs, num_inputs, *plugin);
}

nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt(
    nvinfer1::ITensor *const *inputs,
    int num_inputs,
    nvinfer1::IPluginV2IOExt *plugin) {
  owned_plugin_v2ioext_.emplace_back(plugin);
  return network()->addPluginV2(inputs, num_inputs, *plugin);
}

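// Binds the calling thread to the device this engine was created for; called
// before any CUDA/TensorRT work so multi-GPU setups use the right device.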
void TensorRTEngine::freshDeviceId() {
  int count;
  cudaGetDeviceCount(&count);
  PADDLE_ENFORCE_LT(device_id_,
                    count,
                    platform::errors::OutOfRange(
                        "Device id %d exceeds the current device count: %d.",
                        device_id_,
                        count));
  platform::SetDeviceId(device_id_);
}

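// Dumps per-layer engine information through IEngineInspector (TensorRT 8.2+
// only); on older versions it only logs that the inspector is unavailable.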
void TensorRTEngine::GetEngineInfo() {
#if IS_TRT_VERSION_GE(8200)
  LOG(INFO) << "====== engine info ======";
  std::unique_ptr<nvinfer1::IEngineInspector> infer_inspector(
      infer_engine_->createEngineInspector());
  auto infer_context = context();
  infer_inspector->setExecutionContext(infer_context);
  LOG(INFO) << infer_inspector->getEngineInformation(
      nvinfer1::LayerInformationFormat::kONELINE);
  LOG(INFO) << "====== engine info end ======";
#else
  LOG(INFO) << "Inspector needs TensorRT version 8.2 or later.";
#endif
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle