engine.cc 9.7 KB
Newer Older
Y
Yan Chunwei 已提交
1 2
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

N
nhzlx 已提交
3 4
Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License.
Y
Yan Chunwei 已提交
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/engine.h"

#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
A
Abhinav Arora 已提交
20
#include <string>
Y
Yan Chunwei 已提交
21
#include "paddle/fluid/inference/analysis/helper.h"
Y
Yan Chunwei 已提交
22 23 24 25 26 27 28
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

29 30
// Batch size used by the most recent Execute(); static, so it is shared by
// all engine instances.  Defaults to 1 until the first Execute() call.
int TensorRTEngine::runtime_batch_ = 1;

31
void TensorRTEngine::Build(const DescType &paddle_model) {
Y
Yan Chunwei 已提交
32 33 34 35
  PADDLE_ENFORCE(false, "not implemented");
}

void TensorRTEngine::Execute(int batch_size) {
N
nhzlx 已提交
36
  freshDeviceId();
37 38 39
  batch_size_ = batch_size;
  std::vector<void *> buffers;
  for (auto &buf : buffers_) {
Y
Yan Chunwei 已提交
40 41 42 43 44
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
45
  PADDLE_ENFORCE_NOT_NULL(stream_);
Y
Yan Chunwei 已提交
46
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
Y
Yan Chunwei 已提交
47
  cudaStreamSynchronize(*stream_);
48
  SetRuntimeBatch(batch_size);
Y
Yan Chunwei 已提交
49 50 51
}

// Waits for in-flight work on stream_, then releases every GPU buffer the
// engine owns.
TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // Free only device-side allocations; anything else is not owned here.
  for (auto &buf : buffers_) {
    if (buf.device != DeviceType::GPU || buf.buffer == nullptr) continue;
    PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
    buf.buffer = nullptr;
    buf.max_size = 0;
  }
}

void TensorRTEngine::FreezeNetwork() {
64
  VLOG(3) << "TRT to freeze network";
N
nhzlx 已提交
65
  freshDeviceId();
Y
Yan Chunwei 已提交
66 67 68 69 70 71 72 73 74 75 76 77 78 79
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);

  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());

  // allocate GPU buffers.
Y
Yan Chunwei 已提交
80
  buffers_.resize(buffer_sizes_.size());
81 82 83
  for (auto &item : buffer_sizes_) {
    // The output buffers are not set in the network building phrase, need to
    // infer from the TesorRT network.
Y
Yan Chunwei 已提交
84 85
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
Y
Yan Chunwei 已提交
86
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
Y
Yan Chunwei 已提交
87 88
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
89
                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
90
      PADDLE_ENFORCE_GT(item.second, 0);
Y
Yan Chunwei 已提交
91
    }
92 93 94

    auto &buf = buffer(item.first);
    buf.max_size = item.second * max_batch_;
Y
Yan Chunwei 已提交
95
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
N
nhzlx 已提交
96

97
    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
98
    buf.size = 0;
N
nhzlx 已提交
99
    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
Y
Yan Chunwei 已提交
100
    buf.device = DeviceType::GPU;
Y
Yan Chunwei 已提交
101 102 103
  }
}

104
// Declares a network input binding named `name` and records the number of
// bytes its device buffer will need for a full max_batch_ batch.
// Returns the ITensor TensorRT created for the input.
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);
  PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");

  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);

  const auto elem_bytes = kDataTypeSize[static_cast<int>(dtype)];
  buffer_sizes_[name] =
      elem_bytes * analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
  PADDLE_ENFORCE(input->isNetworkInput());
  TensorRTEngine::SetITensor(name, input);
  return input;
}

120 121
// Marks output #`offset` of `layer` as a network output named `name`.
// The buffer size is recorded as 0 here and resolved in FreezeNetwork()
// once the engine can report the binding dimensions.
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto *output = layer->getOutput(offset);
  // Validate before handing the tensor to SetITensor (previously the null
  // check came after first use).
  PADDLE_ENFORCE(output != nullptr);
  SetITensor(name, output);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  PADDLE_ENFORCE(output->isNetworkOutput());
  // output buffers' size can only be decided latter, set zero here to mark this
  // and will reset latter.
  buffer_sizes_[name] = 0;
}

N
nhzlx 已提交
137 138 139 140
// True iff an input or output named `name` has already been declared.
bool TensorRTEngine::HasDeclared(const std::string &name) {
  return buffer_sizes_.find(name) != buffer_sizes_.end();
}

141
void TensorRTEngine::DeclareOutput(const std::string &name) {
L
Luo Tao 已提交
142 143 144
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

145
  auto *output = TensorRTEngine::GetITensor(name);
L
Luo Tao 已提交
146 147
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
148
  PADDLE_ENFORCE(!output->isNetworkInput());
L
Luo Tao 已提交
149 150 151 152 153 154
  infer_network_->markOutput(*output);
  // output buffers' size can only be decided latter, set zero here to mark this
  // and will reset latter.
  buffer_sizes_[name] = 0;
}

155
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
Y
Yan Chunwei 已提交
156
  return buffer(name).buffer;
Y
Yan Chunwei 已提交
157 158
}

N
nhzlx 已提交
159 160
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                    size_t max_size) {
161
  // determine data size
N
nhzlx 已提交
162
  auto *output = TensorRTEngine::GetITensor(name);
163 164 165 166 167
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];

168 169 170
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
171
  PADDLE_ENFORCE_LE(dst_size, it->second);
N
nhzlx 已提交
172
  PADDLE_ENFORCE_GE(max_size, dst_size);
173
  auto &buf = buffer(name);
174
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
175
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
176 177 178 179
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
}

N
nhzlx 已提交
180 181
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                    size_t max_size) {
Y
Yan Chunwei 已提交
182
  // determine data size
183

N
nhzlx 已提交
184
  auto *output = TensorRTEngine::GetITensor(name);
185 186 187 188
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];
Y
Yan Chunwei 已提交
189 190 191
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
192
  PADDLE_ENFORCE_LE(dst_size, it->second);
N
nhzlx 已提交
193
  PADDLE_ENFORCE_GE(max_size, dst_size);
N
nhzlx 已提交
194
  auto &buf = buffer(name);
Y
Yan Chunwei 已提交
195
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
196
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
Y
Yan Chunwei 已提交
197 198 199
                                       cudaMemcpyDeviceToHost, *stream_));
}

200
// Looks up the Buffer bound to `name` via the engine's binding index.
// Only valid after FreezeNetwork() has built the engine.
Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
                 name);
  const int slot = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot];
}

209
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
Y
Yan Chunwei 已提交
210
                                     size_t size) {
211
  auto &buf = buffer(name);
Y
Yan Chunwei 已提交
212
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
213 214
  PADDLE_ENFORCE_NOT_NULL(data);
  PADDLE_ENFORCE_NOT_NULL(stream_);
Y
Yan Chunwei 已提交
215 216
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
217
  buf.size = size;
Y
Yan Chunwei 已提交
218 219
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
Y
Yan Chunwei 已提交
220 221
}

222
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
223
                                     size_t size) {
224 225
  auto &buf = buffer(name);
  buf.size = size;
226 227 228 229 230 231 232
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyDeviceToDevice, *stream_));
}

233 234
void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
L
Luo Tao 已提交
235
  PADDLE_ENFORCE(tensor != nullptr);
Y
Yan Chunwei 已提交
236
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
L
Luo Tao 已提交
237 238 239 240
                    name);
  itensor_map_[name] = tensor;
}

241
// Returns the tensor previously registered under `name` via SetITensor.
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  auto it = itensor_map_.find(name);
  PADDLE_ENFORCE(it != itensor_map_.end(), "no ITensor %s", name);
  return it->second;
}

246 247 248 249 250 251
// Records the batch size used by the most recent Execute().
// NOTE(review): runtime_batch_ is an int, so a very large size_t would
// narrow here -- confirm callers keep batch sizes in int range.
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
  runtime_batch_ = batch_size;
}

// Batch size of the most recent Execute() call (1 before any call).
int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

N
nhzlx 已提交
252 253 254 255 256 257 258
// Binds the calling thread to this engine's CUDA device (device_).
void TensorRTEngine::freshDeviceId() {
  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  PADDLE_ENFORCE_LT(device_, device_count);
  cudaSetDevice(device_);
}

N
nhzlx 已提交
259
// Adds a custom plugin layer over `inputs`; the engine takes ownership of
// `plugin` (stored in owned_plugin_ so it outlives the network).
nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return infer_network_->addPluginExt(inputs, nbInputs, *plugin);
}

Y
Yan Chunwei 已提交
265 266 267
}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle