/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use
this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/inference/tensorrt/engine.h"

#include <NvInfer.h>
#include <cuda.h>
#include <glog/logging.h>
#include <string>
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace inference {
namespace tensorrt {

int TensorRTEngine::runtime_batch_ = 1;

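// Building the engine straight from a Paddle model description is not
// supported yet; networks are assembled through DeclareInput, layer
// additions, and DeclareOutput instead.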
void TensorRTEngine::Build(const DescType &paddle_model) {
  PADDLE_ENFORCE(false, "not implemented");
}

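// Run one inference pass: every bound GPU buffer is enqueued on stream_ and
// the call blocks until the stream has finished.
//
// A typical engine lifecycle, as a sketch (names such as "x" and "y" are
// illustrative only):
//   engine.DeclareInput("x", nvinfer1::DataType::kFLOAT, dims);
//   /* ... add layers to the network ... */
//   engine.DeclareOutput(layer, 0, "y");
//   engine.FreezeNetwork();
//   engine.SetInputFromCPU("x", x_data, x_size);
//   engine.Execute(batch_size);
//   engine.GetOutputInCPU("y", y_data, y_size);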
void TensorRTEngine::Execute(int batch_size) {
  freshDeviceId();
  batch_size_ = batch_size;
  std::vector<void *> buffers;
  for (auto &buf : buffers_) {
    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
    PADDLE_ENFORCE_GT(buf.max_size, 0);
    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
    buffers.push_back(buf.buffer);
  }
  PADDLE_ENFORCE_NOT_NULL(stream_);
  infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
  cudaStreamSynchronize(*stream_);
  SetRuntimeBatch(batch_size);
}

TensorRTEngine::~TensorRTEngine() {
  cudaStreamSynchronize(*stream_);
  // Release the GPU buffers.
  for (auto &buf : buffers_) {
    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
      buf.buffer = nullptr;
      buf.max_size = 0;
    }
  }
}

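// Finalize the network: build the CUDA engine, create its execution context,
// and allocate one GPU buffer per binding.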
void TensorRTEngine::FreezeNetwork() {
  freshDeviceId();
  PADDLE_ENFORCE(infer_builder_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  PADDLE_ENFORCE(infer_network_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  // build engine.
  infer_builder_->setMaxBatchSize(max_batch_);
  infer_builder_->setMaxWorkspaceSize(max_workspace_);

  infer_engine_.reset(infer_builder_->buildCudaEngine(*infer_network_));
  PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");

  infer_context_.reset(infer_engine_->createExecutionContext());

  // allocate GPU buffers.
  buffers_.resize(buffer_sizes_.size());
  for (auto &item : buffer_sizes_) {
    // The output buffer sizes are not set during the network building phase,
    // so they are inferred from the TensorRT network here.
    if (item.second == 0) {
      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
      auto dims = infer_engine_->getBindingDimensions(slot_offset);
      item.second = kDataTypeSize[static_cast<int>(
                        infer_engine_->getBindingDataType(slot_offset))] *
                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
      PADDLE_ENFORCE_GT(item.second, 0);
    }

    auto &buf = buffer(item.first);
    buf.max_size = item.second * max_batch_;
    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.

    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
    buf.size = 0;
    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 1 GiB
    buf.device = DeviceType::GPU;
  }
}

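// Declare a network input binding named `name` and record the maximum buffer
// size it will need.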
nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                nvinfer1::DataType dtype,
                                                const nvinfer1::Dims &dims) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                    name);

  PADDLE_ENFORCE(infer_network_ != nullptr,
                 "Call InitNetwork first to initialize network.");
  auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
  PADDLE_ENFORCE(input, "infer network add input %s failed", name);
  buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
  PADDLE_ENFORCE(input->isNetworkInput());
  TensorRTEngine::SetITensor(name, input);
  return input;
}

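// Declare the offset-th output of `layer` as a network output named `name`.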
void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                   const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto *output = layer->getOutput(offset);
  SetITensor(name, output);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  PADDLE_ENFORCE(output->isNetworkOutput());
  // An output buffer's size can only be determined later, so set it to zero
  // here as a marker; it is reset when the network is frozen.
  buffer_sizes_[name] = 0;
}

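// True if a buffer named `name` has already been declared as an input or
// output.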
bool TensorRTEngine::HasDeclared(const std::string &name) {
  return buffer_sizes_.count(name) > 0;
}

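// Declare an already-registered ITensor as a network output.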
void TensorRTEngine::DeclareOutput(const std::string &name) {
  PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                    name);

  auto *output = TensorRTEngine::GetITensor(name);
  PADDLE_ENFORCE(output != nullptr);
  output->setName(name.c_str());
  PADDLE_ENFORCE(!output->isNetworkInput());
  infer_network_->markOutput(*output);
  // An output buffer's size can only be determined later, so set it to zero
  // here as a marker; it is reset when the network is frozen.
  buffer_sizes_[name] = 0;
}

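// Return the raw GPU pointer holding the named output.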
void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
  return buffer(name).buffer;
}

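// Copy the named output into the GPU buffer `dst` (device-to-device,
// asynchronous on stream_).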
void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size
  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];

  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
  PADDLE_ENFORCE_GE(max_size, dst_size);
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                    cudaMemcpyDeviceToDevice, *stream_),
                    0);
}

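// Copy the named output into the host buffer `dst` (device-to-host,
// asynchronous on stream_); synchronize the stream before reading `dst`.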
void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                    size_t max_size) {
  // determine data size

  auto *output = TensorRTEngine::GetITensor(name);
  nvinfer1::Dims dims = output->getDimensions();
  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
  size_t dst_size = dim_size * runtime_batch_ *
                    kDataTypeSize[static_cast<int>(output->getType())];
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  PADDLE_ENFORCE_GT(it->second, 0);
  PADDLE_ENFORCE_LE(dst_size, it->second);
  PADDLE_ENFORCE_GE(max_size, dst_size);
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                       cudaMemcpyDeviceToHost, *stream_));
}

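// Look up the device buffer bound to `name` via the engine's binding index;
// only valid after FreezeNetwork().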
Buffer &TensorRTEngine::buffer(const std::string &name) {
  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
  auto it = buffer_sizes_.find(name);
  PADDLE_ENFORCE(it != buffer_sizes_.end());
  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
  return buffers_[slot_offset];
}

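// Copy `size` bytes of host data into the named input's GPU buffer
// (host-to-device, asynchronous on stream_).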
void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                     size_t size) {
  auto &buf = buffer(name);
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_NOT_NULL(data);
  PADDLE_ENFORCE_NOT_NULL(stream_);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  buf.size = size;
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyHostToDevice, *stream_));
}

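// Copy `size` bytes of device data into the named input's GPU buffer
// (device-to-device, asynchronous on stream_).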
void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                     size_t size) {
  auto &buf = buffer(name);
  buf.size = size;
  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                       cudaMemcpyDeviceToDevice, *stream_));
}

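// Register `tensor` under `name`; each name may be bound only once.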
void TensorRTEngine::SetITensor(const std::string &name,
                                nvinfer1::ITensor *tensor) {
  PADDLE_ENFORCE(tensor != nullptr);
  PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                    name);
  itensor_map_[name] = tensor;
}

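// Fetch the ITensor previously registered under `name`.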
nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
  PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
  return itensor_map_[name];
}

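// Record the batch size used by the most recent Execute() call.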
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
  runtime_batch_ = batch_size;
}

int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }

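// Make device_ the active CUDA device so subsequent CUDA calls target the
// right GPU.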
void TensorRTEngine::freshDeviceId() {
  int count;
  cudaGetDeviceCount(&count);
  PADDLE_ENFORCE_LT(device_, count);
  cudaSetDevice(device_);
}

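// Add a custom plugin layer to the network; the engine takes ownership of
// `plugin`.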
nvinfer1::IPluginLayer *TensorRTEngine::addPlugin(
    nvinfer1::ITensor *const *inputs, int nbInputs, PluginTensorRT *plugin) {
  owned_plugin_.emplace_back(plugin);
  return infer_network_.get()->addPluginExt(inputs, nbInputs, *plugin);
}

}  // namespace tensorrt
}  // namespace inference
}  // namespace paddle