Unverified  Commit 6169d724  Authored by: Zhaolong Xing  Committed by: GitHub

Merge pull request #12324 from NHZlX/enhance_for_tensorrt_infer

Enhance for tensorrt infer
@@ -32,11 +32,11 @@ void Reorder2(nvinfer1::DimsHW shape, const T* idata, nvinfer1::DimsHW istrides,
   for (int h = 0; h < shape.h(); ++h) {
     for (int w = 0; w < shape.w(); ++w) {
       odata[h * ostrides.h() + w * ostrides.w()] =
-          idata[h * ostrides.h() + w * ostrides.w()];
+          idata[h * istrides.h() + w * istrides.w()];
     }
   }
 }
+// indata c * k
 // Reorder the data layout from CK to KC.
 void ReorderCKtoKC(TensorRTEngine::Weight& iweights,
                    TensorRTEngine::Weight* oweights) {
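
For context on the istrides fix and the argument-order fix below: Reorder2 is a strided 2-D copy, and ReorderCKtoKC uses it to transpose a C x K weight matrix into K x C for TensorRT's FC layer. A minimal standalone sketch of the same transform (plain arrays in place of TensorRTEngine::Weight; the helper name is mine):

#include <cassert>
#include <vector>

// Transpose a row-major c-by-k matrix into k-by-c: read with strides {k, 1},
// write with strides {1, c} -- the istrides/ostrides trick Reorder2 performs.
std::vector<float> ReorderCKtoKCSketch(const std::vector<float>& ck, int c,
                                       int k) {
  std::vector<float> kc(ck.size());
  for (int i = 0; i < c; ++i) {
    for (int j = 0; j < k; ++j) {
      kc[j * c + i] = ck[i * k + j];
    }
  }
  return kc;
}

int main() {
  // 2x3 (CK) -> 3x2 (KC).
  std::vector<float> ck = {1, 2, 3, 4, 5, 6};
  assert(ReorderCKtoKCSketch(ck, 2, 3) ==
         (std::vector<float>{1, 4, 2, 5, 3, 6}));
  return 0;
}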
@@ -79,9 +79,8 @@ class FcOpConverter : public OpConverter {
     framework::LoDTensor tmp;
     tmp.Resize(Y_t->dims());
-    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), Y_t->data<float>(),
-           Y_t->dims()[0] * Y_t->dims()[1]);
+    memcpy(tmp.mutable_data<float>(platform::CPUPlace()), weight_data,
+           Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
                                   Y_t->memory_size() / sizeof(float)};
@@ -93,7 +92,7 @@ class FcOpConverter : public OpConverter {
     // The data layout of TRT FC layer's weight is different from fluid's FC,
     // need to reorder the elements.
-    ReorderCKtoKC(tmp_weight, &weight);
+    ReorderCKtoKC(weight, &tmp_weight);
     // Currently, the framework can only handle one fluid op -> one TRT layer,
     // but fc fuses `mul` and `bias` (2 fluid ops), so here is a trick, just
@@ -103,7 +102,7 @@ class FcOpConverter : public OpConverter {
     auto* layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected,
                                        *const_cast<nvinfer1::ITensor*>(X),
-                                       n_output, weight.get(), bias.get());
+                                       n_output, tmp_weight.get(), bias.get());
     auto output_name = op_desc.Output("Out").front();
     engine_->SetITensor(output_name, layer->getOutput(0));
@@ -118,4 +117,3 @@ class FcOpConverter : public OpConverter {
 }  // namespace paddle
 REGISTER_TRT_OP_CONVERTER(fc, FcOpConverter);
-USE_OP(mul);
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
-  validator.Execute(10);
+  validator.Execute(1);
 }
 }  // namespace tensorrt
...
@@ -23,11 +23,11 @@ namespace tensorrt {
 TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
-  TRTConvertValidation validator(20, parameters, scope, 1000);
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(8, 3, 1, 1));
-  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(3, 2));
-  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(8, 2));
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
+  validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
+  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
+  validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
   // Prepare Op description
   framework::OpDesc desc;
@@ -38,9 +38,10 @@ TEST(fc_op, test) {
   validator.SetOp(*desc.Proto());
-  validator.Execute(10);
+  validator.Execute(1);
 }
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
+USE_OP(mul);
@@ -39,7 +39,7 @@ TEST(MulOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
-  validator.Execute(10);
+  validator.Execute(1);
 }
 }  // namespace tensorrt
...
@@ -39,7 +39,7 @@ namespace tensorrt {
 float random(float low, float high) {
   static std::random_device rd;
   static std::mt19937 mt(rd());
-  std::uniform_real_distribution<double> dist(1.0, 10.0);
+  std::uniform_real_distribution<double> dist(low, high);
   return dist(mt);
 }
@@ -49,6 +49,7 @@ void RandomizeTensor(framework::LoDTensor* tensor, const platform::Place& place,
   size_t num_elements = analysis::AccuDims(dims, dims.size());
   PADDLE_ENFORCE_GT(num_elements, 0);
+  auto* data = tensor->mutable_data<float>(place);
   for (size_t i = 0; i < num_elements; i++) {
     *(data + i) = random(0., 1.);
   }
@@ -68,7 +69,7 @@ class TRTConvertValidation {
                       int workspace_size = 1 << 10)
       : parameters_(parameters), scope_(scope) {
     // create engine.
-    engine_.reset(new TensorRTEngine(10, 1 << 10, &stream_));
+    engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
     engine_->InitNetwork();
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
@@ -138,12 +139,11 @@ class TRTConvertValidation {
     cudaStreamSynchronize(*engine_->stream());
     ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 200;
+    const size_t output_space_size = 2000;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       std::vector<float> fluid_out;
       std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0],
-                              output_space_size * sizeof(float));
+      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
       cudaStreamSynchronize(*engine_->stream());
       auto* var = scope_.FindVar(output);
...
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use
+this file except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
@@ -26,6 +26,8 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
+int TensorRTEngine::runtime_batch_ = 1;
 void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
@@ -42,6 +44,7 @@ void TensorRTEngine::Execute(int batch_size) {
   PADDLE_ENFORCE_NOT_NULL(stream_);
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
+  SetRuntimeBatch(batch_size);
 }
 TensorRTEngine::~TensorRTEngine() {
@@ -80,17 +83,17 @@ void TensorRTEngine::FreezeNetwork() {
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims);
+                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
       PADDLE_ENFORCE_GT(item.second, 0);
     }
     auto &buf = buffer(item.first);
     buf.max_size = item.second * max_batch_;
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
-    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
+    // buf.size will changed in the runtime.
     buf.size = 0;
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
     buf.device = DeviceType::GPU;
   }
 }
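
Read together, the hunk above reserves each binding's device buffer for the largest batch up front: bytes = dtype size x per-sample volume x max_batch_. A rough standalone sketch of that sizing rule (helper name is mine):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Max bytes to reserve for one binding: element size times the product of
// the per-sample dims, times the largest batch the engine may run.
size_t MaxBindingBytes(size_t dtype_size, const std::vector<int>& dims,
                       int max_batch) {
  size_t per_sample = std::accumulate(dims.begin(), dims.end(), size_t{1},
                                      std::multiplies<size_t>());
  return dtype_size * per_sample * max_batch;
}
// e.g. a float output of shape {6, 1, 1} with max_batch 10:
// MaxBindingBytes(sizeof(float), {6, 1, 1}, 10) == 240.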
@@ -105,7 +108,7 @@ nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
   auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
-                        analysis::AccuDims(dims.d, dims.nbDims);
+                        analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
   PADDLE_ENFORCE(input->isNetworkInput());
   TensorRTEngine::SetITensor(name, input);
   return input;
@@ -149,35 +152,42 @@ void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
 void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
+  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                     size_t max_size) {
-  VLOG(4) << "get output in cpu";
-  auto &buf = buffer(name);
-  // Update needed buffer size.
-  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-  auto dims = infer_engine_->getBindingDimensions(slot_offset);
-  buf.size = kDataTypeSize[static_cast<int>(
-                 infer_engine_->getBindingDataType(slot_offset))] *
-             analysis::AccuDims(dims.d, dims.nbDims);
-  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
   // determine data size
+  auto *output = TensorRTEngine::GetITensor(name);
+  nvinfer1::Dims dims = output->getDimensions();
+  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
+  size_t dst_size = dim_size * runtime_batch_ *
+                    kDataTypeSize[static_cast<int>(output->getType())];
+  auto it = buffer_sizes_.find(name);
+  PADDLE_ENFORCE(it != buffer_sizes_.end());
+  PADDLE_ENFORCE_GT(it->second, 0);
+  PADDLE_ENFORCE_LE(dst_size, it->second);
+  PADDLE_ENFORCE_GE(max_size, dst_size);
+  auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  // DEBUG
-  memset(dst, 0, buf.size);
-  PADDLE_ENFORCE_EQ(
-      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
+  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
+                                       cudaMemcpyDeviceToHost, *stream_));
 }
 Buffer &TensorRTEngine::buffer(const std::string &name) {
@@ -225,6 +235,12 @@ nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
   return itensor_map_[name];
 }
+void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
+  runtime_batch_ = batch_size;
+}
+
+int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
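
With runtime_batch_ recorded on every Execute(), the two GetOutput* paths above can size their copies by the batch actually run instead of the engine's max batch. A simplified sketch of the dst_size computation (standalone; names are mine):

#include <cstddef>

// Bytes to copy for one output binding: per-sample element count, times the
// batch recorded by the last Execute(), times the element size.
size_t DstBytes(size_t per_sample_elems, int runtime_batch,
                size_t dtype_size) {
  return per_sample_elems * runtime_batch * dtype_size;
}
// e.g. a float output with per-sample volume 6, after Execute(2):
// DstBytes(6, 2, sizeof(float)) == 48, even if max_batch_ is 10.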
@@ -117,10 +117,14 @@ class TensorRTEngine : public EngineBase {
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+  void SetRuntimeBatch(size_t batch_size);
+  int GetRuntimeBatch();
  private:
   // the max batch size
   int max_batch_;
+  // the runtime batch size
+  static int runtime_batch_;
   // the max memory size the engine uses
   int max_workspace_;
...
@@ -28,7 +28,7 @@ class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
     ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(1, 1 << 10, &stream_);
+    engine_ = new TensorRTEngine(10, 1 << 10, &stream_);
     engine_->InitNetwork();
   }
@@ -71,7 +71,7 @@ TEST_F(TensorRTEngineTest, add_layer) {
   LOG(INFO) << "to get output";
   float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, sizeof(float));
+  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
   LOG(INFO) << "to checkout output";
   ASSERT_EQ(y_cpu, x_v * 2 + 3);
@@ -103,15 +103,49 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   LOG(INFO) << "to get output";
   float y_cpu[2] = {-1., -1.};
   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
+  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
+TEST_F(TensorRTEngineTest, test_conv2d_temp) {
+  // Weight in CPU memory.
+  float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  float raw_bias[1] = {0};
+
+  TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
+  TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
+  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+                                  nvinfer1::Dims3{1, 3, 3});
+  auto* conv_layer =
+      TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
+                           weight.get(), bias.get());
+  PADDLE_ENFORCE(conv_layer != nullptr);
+  conv_layer->setStride(nvinfer1::DimsHW{1, 1});
+  conv_layer->setPadding(nvinfer1::DimsHW{1, 1});
+
+  engine_->DeclareOutput(conv_layer, 0, "y");
+  engine_->FreezeNetwork();
+  ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
+
+  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
+                           18 * sizeof(float));
+  engine_->Execute(2);
+
+  LOG(INFO) << "to get output";
+  float* y_cpu = new float[18];
+  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+
+  ASSERT_EQ(y_cpu[0], 4.0);
+  ASSERT_EQ(y_cpu[1], 6.0);
+}
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
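
A quick check of the values the new test asserts: with an all-ones 3x3 input, an all-ones 3x3 kernel, stride 1, and padding 1, each output pixel sums the in-bounds taps of its window, so the corner output is 4.0 (a 2x2 valid window) and the neighboring edge output is 6.0 (a 2x3 window); the second sample in the Execute(2) batch yields the same 3x3 map, which is why 18 floats are read back.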
@@ -55,13 +55,14 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
   switch (shape.size()) {
     case 2:
-      return nvinfer1::Dims2(shape[0], shape[1]);
+      return nvinfer1::Dims2(1, shape[1]);
     case 3:
-      return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
+      return nvinfer1::Dims3(1, shape[1], shape[2]);
     case 4:
-      return nvinfer1::Dims4(shape[0], shape[1], shape[2], shape[3]);
+      return nvinfer1::Dims4(1, shape[1], shape[2], shape[3]);
     default:
       return nvinfer1::Dims();
   }
...
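The hunk above pins the leading (batch) dimension of every declared shape to 1: with TensorRT's implicit-batch API the bindings carry per-sample dims only, and the batch count is supplied at enqueue time. A minimal sketch of the mapping (helper name is mine):

#include <cstdint>
#include <vector>

// Keep the per-sample dims and pin the batch dim to 1;
// e.g. {8, 3, 1, 1} -> {1, 3, 1, 1}. The real batch count is passed to
// the execution context's enqueue(batch_size, ...) at run time.
std::vector<int64_t> ToPerSampleShape(std::vector<int64_t> shape) {
  shape[0] = 1;
  return shape;
}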
@@ -93,13 +93,15 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
       fluid_t->Resize(framework::make_ddim(ddim));
       // TODO(Superjomn) find some way to determine which device to output the
       // tensor.
       // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
+      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
+                  FLAGS_tensorrt_engine_batch_size;
       engine->GetOutputInCPU(y,
                              fluid_t->mutable_data<float>(platform::CPUPlace()),
                              size * sizeof(float));
...
@@ -64,36 +64,37 @@ TEST(TensorRTEngineOp, manual) {
   LOG(INFO) << "create block desc";
   framework::BlockDesc block_desc(&program, block_);
-  LOG(INFO) << "create mul op";
-  auto* mul = block_desc.AppendOp();
-  mul->SetType("mul");
-  mul->SetInput("X", std::vector<std::string>({"x"}));     // 2 x 4
-  mul->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
-  mul->SetOutput("Out", std::vector<std::string>({"z"}));  // 2 x 6
+  LOG(INFO) << "create fc op";
+  auto* fc0 = block_desc.AppendOp();
+  fc0->SetType("fc");
+  fc0->SetInput("X", std::vector<std::string>({"x"}));     // 4 x 1 x 1
+  fc0->SetInput("Y", std::vector<std::string>({"y"}));     // 4 x 6
+  fc0->SetOutput("Out", std::vector<std::string>({"z"}));  // 6 x 1 x 1
   LOG(INFO) << "create fc op";
-  auto* fc = block_desc.AppendOp();
-  fc->SetType("mul");
-  fc->SetInput("X", std::vector<std::string>({"z"}));
-  fc->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
-  fc->SetOutput("Out", std::vector<std::string>({"z0"}));  // 2 x 8
+  auto* fc1 = block_desc.AppendOp();
+  fc1->SetType("fc");
+  fc1->SetInput("X", std::vector<std::string>({"z"}));
+  fc1->SetInput("Y", std::vector<std::string>({"y0"}));     // 6 x 8
+  fc1->SetOutput("Out", std::vector<std::string>({"z0"}));  // 8 x 1 x 1
   // Set inputs' variable shape in BlockDesc
-  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4}));
+  // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1}
+  AddTensorToBlockDesc(block_, "x", std::vector<int64_t>({2, 4, 1, 1}));
   AddTensorToBlockDesc(block_, "y", std::vector<int64_t>({4, 6}));
   AddTensorToBlockDesc(block_, "y0", std::vector<int64_t>({6, 8}));
   AddTensorToBlockDesc(block_, "z", std::vector<int64_t>({2, 6}));
   // It is wired, need to copy manually.
-  *block_->add_ops() = *mul->Proto();
-  *block_->add_ops() = *fc->Proto();
+  *block_->add_ops() = *fc0->Proto();
+  *block_->add_ops() = *fc1->Proto();
   ASSERT_EQ(block_->ops_size(), 2);
   LOG(INFO) << "create tensorrt desc";
   framework::OpDesc engine_op_desc(nullptr);
   engine_op_desc.SetType("tensorrt_engine");
-  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x", "y", "y0"}));
+  engine_op_desc.SetInput("Xs", std::vector<std::string>({"x"}));
   engine_op_desc.SetOutput("Ys", std::vector<std::string>({"z0"}));
   SetAttr<std::string>(engine_op_desc.Proto(), "subgraph",
                        block_->SerializeAsString());
@@ -207,5 +208,4 @@ TEST(TensorRTEngineOp, fc) { Execute(40, 28, 28); }
 }  // namespace operators
 }  // namespace paddle
-USE_TRT_CONVERTER(mul)
 USE_TRT_CONVERTER(fc)