提交 2372daff 编写于 作者: N nhzlx

There is no batch-size concept in TensorRT's tensor

上级 4a076178
...@@ -26,6 +26,8 @@ namespace paddle { ...@@ -26,6 +26,8 @@ namespace paddle {
namespace inference { namespace inference {
namespace tensorrt { namespace tensorrt {
// Out-of-class definition of the static runtime batch size, initialised to 1.
// NOTE(review): as a static data member this value is shared by every
// TensorRTEngine instance — confirm that is intended when several engines
// run with different batch sizes.
int TensorRTEngine::runtime_batch_ = 1;
void TensorRTEngine::Build(const DescType& paddle_model) { void TensorRTEngine::Build(const DescType& paddle_model) {
PADDLE_ENFORCE(false, "not implemented"); PADDLE_ENFORCE(false, "not implemented");
} }
...@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) { ...@@ -40,6 +42,7 @@ void TensorRTEngine::Execute(int batch_size) {
} }
infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
cudaStreamSynchronize(*stream_); cudaStreamSynchronize(*stream_);
SetRuntimeBatch(batch_size);
} }
TensorRTEngine::~TensorRTEngine() { TensorRTEngine::~TensorRTEngine() {
...@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() { ...@@ -76,14 +79,15 @@ void TensorRTEngine::FreezeNetwork() {
auto dims = infer_engine_->getBindingDimensions(slot_offset); auto dims = infer_engine_->getBindingDimensions(slot_offset);
item.second = kDataTypeSize[static_cast<int>( item.second = kDataTypeSize[static_cast<int>(
infer_engine_->getBindingDataType(slot_offset))] * infer_engine_->getBindingDataType(slot_offset))] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
} }
auto& buf = buffer(item.first); auto& buf = buffer(item.first);
CHECK(buf.buffer == nullptr); // buffer should be allocated only once. CHECK(buf.buffer == nullptr); // buffer should be allocated only once.
PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
<< buf.buffer; << buf.buffer;
buf.size = buf.max_size = item.second; buf.size = item.second;
buf.max_size = item.second * max_batch_;
buf.device = DeviceType::GPU; buf.device = DeviceType::GPU;
} }
} }
...@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, ...@@ -98,7 +102,7 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
auto* input = infer_network_->addInput(name.c_str(), dtype, dims); auto* input = infer_network_->addInput(name.c_str(), dtype, dims);
PADDLE_ENFORCE(input, "infer network add input %s failed", name); PADDLE_ENFORCE(input, "infer network add input %s failed", name);
buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] * buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
analysis::AccuDims(dims.d, dims.nbDims); analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
PADDLE_ENFORCE(input->isNetworkInput()); PADDLE_ENFORCE(input->isNetworkInput());
TensorRTEngine::SetITensor(name, input); TensorRTEngine::SetITensor(name, input);
return input; return input;
...@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) { ...@@ -139,30 +143,40 @@ void* TensorRTEngine::GetOutputInGPU(const std::string& name) {
return buffer(name).buffer; return buffer(name).buffer;
} }
void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst, void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst) {
size_t max_size) {
// determine data size // determine data size
auto* output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second); PADDLE_ENFORCE_LE(dst_size, it->second);
auto& buf = buffer(name); auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToDevice, *stream_), cudaMemcpyDeviceToDevice, *stream_),
0); 0);
} }
void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst) {
size_t max_size) {
// determine data size // determine data size
auto* output = TensorRTEngine::GetITensor(name);
nvinfer1::Dims dims = output->getDimensions();
auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
size_t dst_size = dim_size * runtime_batch_ *
kDataTypeSize[static_cast<int>(output->getType())];
auto it = buffer_sizes_.find(name); auto it = buffer_sizes_.find(name);
PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE(it != buffer_sizes_.end());
PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GT(it->second, 0);
PADDLE_ENFORCE_GE(max_size, it->second); PADDLE_ENFORCE_LE(dst_size, it->second);
auto& buf = buffer(name); auto& buf = buffer(name);
PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second, PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
cudaMemcpyDeviceToHost, *stream_)); cudaMemcpyDeviceToHost, *stream_));
} }
...@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) { ...@@ -207,6 +221,12 @@ nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) {
return itensor_map_[name]; return itensor_map_[name];
} }
// Stores `batch_size` as the runtime batch size.
// runtime_batch_ is declared as int, so the size_t argument is narrowed to
// int; the cast only makes that conversion explicit.
void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
  runtime_batch_ = static_cast<int>(batch_size);
}
// Returns the runtime batch size currently stored in runtime_batch_.
int TensorRTEngine::GetRuntimeBatch() {
  return runtime_batch_;
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -104,10 +104,10 @@ class TensorRTEngine : public EngineBase { ...@@ -104,10 +104,10 @@ class TensorRTEngine : public EngineBase {
// Return the output's GPU memory address without copy. // Return the output's GPU memory address without copy.
void* GetOutputInGPU(const std::string& name); void* GetOutputInGPU(const std::string& name);
// Copy data into dst inside the GPU device. // Copy data into dst inside the GPU device.
void GetOutputInGPU(const std::string& name, void* dst, size_t max_size); void GetOutputInGPU(const std::string& name, void* dst);
// LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU // LOW EFFICIENCY! Get output to CPU, this will trigger a memory copy from GPU
// to CPU. // to CPU.
void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); void GetOutputInCPU(const std::string& name, void* dst);
// Fill an ITensor into map itensor_map_. // Fill an ITensor into map itensor_map_.
void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
// Get an ITensor called name. // Get an ITensor called name.
...@@ -115,10 +115,14 @@ class TensorRTEngine : public EngineBase { ...@@ -115,10 +115,14 @@ class TensorRTEngine : public EngineBase {
nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
void SetRuntimeBatch(size_t batch_size);
int GetRuntimeBatch();
private: private:
// the max batch size // the max batch size
int max_batch_; int max_batch_;
// the runtime batch size
static int runtime_batch_;
// the max memory size the engine uses // the max memory size the engine uses
int max_workspace_; int max_workspace_;
cudaStream_t* stream_; cudaStream_t* stream_;
......
...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test { ...@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
protected: protected:
void SetUp() override { void SetUp() override {
ASSERT_EQ(0, cudaStreamCreate(&stream_)); ASSERT_EQ(0, cudaStreamCreate(&stream_));
engine_ = new TensorRTEngine(1, 1 << 10, &stream_); engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
engine_->InitNetwork(); engine_->InitNetwork();
} }
...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu; float y_cpu;
engine_->GetOutputInCPU("y", &y_cpu, sizeof(float)); engine_->GetOutputInCPU("y", &y_cpu);
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu, x_v * 2 + 3); ASSERT_EQ(y_cpu, x_v * 2 + 3);
...@@ -103,11 +103,44 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { ...@@ -103,11 +103,44 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
LOG(INFO) << "to get output"; LOG(INFO) << "to get output";
float y_cpu[2] = {-1., -1.}; float y_cpu[2] = {-1., -1.};
engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); engine_->GetOutputInCPU("y", &y_cpu[0]);
ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[0], 4.5);
ASSERT_EQ(y_cpu[1], 14.5); ASSERT_EQ(y_cpu[1], 14.5);
} }
// Builds a network with a single 3x3 convolution (all-ones kernel, zero bias,
// stride 1, padding 1) over an input declared as Dims3{1, 3, 3}, feeds 18
// ones, executes with a batch size of 2 and checks the first two output
// values.
TEST_F(TensorRTEngineTest, test_conv2d_temp) {
  // Weight in CPU memory.
  float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
  float raw_bias[1] = {0};

  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                  nvinfer1::Dims3{1, 3, 3});
  auto* conv_layer =
      TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                           weight.get(), bias.get());
  PADDLE_ENFORCE(conv_layer != nullptr);
  conv_layer->setStride(nvinfer1::DimsHW{1, 1});
  conv_layer->setPadding(nvinfer1::DimsHW{1, 1});

  engine_->DeclareOutput(conv_layer, 0, "y");
  engine_->FreezeNetwork();
  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);

  // 18 input values for the batch-of-2 execution below.
  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
                           18 * sizeof(float));
  engine_->Execute(2);

  LOG(INFO) << "to get output";
  // Stack storage rather than `new float[18]`: the heap buffer was never
  // released, and a trailing delete[] would still be skipped whenever an
  // ASSERT_* fails because fatal assertions return from the test body.
  // Zero-initialised so a short copy cannot leave indeterminate values.
  // Sized for 18 floats, assuming the output keeps 9 elements per sample
  // for the 2 samples — TODO confirm against the output dimensions.
  float y_cpu[18] = {0};
  engine_->GetOutputInCPU("y", &y_cpu[0]);
  // With padding 1 the top-left (corner) window is expected to cover 4 ones
  // and its right neighbour (edge) 6 ones.
  ASSERT_EQ(y_cpu[0], 4.0);
  ASSERT_EQ(y_cpu[1], 6.0);
}
} // namespace tensorrt } // namespace tensorrt
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册