提交 96216052 编写于 作者: N nhzlx

1. Fix TensorRT (trt) multi-thread bug

上级 30aad884
...@@ -103,6 +103,7 @@ class OpConverter { ...@@ -103,6 +103,7 @@ class OpConverter {
void ConvertBlock(const framework::proto::BlockDesc& block, void ConvertBlock(const framework::proto::BlockDesc& block,
const std::unordered_set<std::string>& parameters, const std::unordered_set<std::string>& parameters,
const framework::Scope& scope, TensorRTEngine* engine) { const framework::Scope& scope, TensorRTEngine* engine) {
std::unique_lock<std::mutex> lk(mut_);
for (int i = 0; i < block.ops_size(); i++) { for (int i = 0; i < block.ops_size(); i++) {
const auto& op = block.ops(i); const auto& op = block.ops(i);
ConvertOp(op, parameters, scope, engine); ConvertOp(op, parameters, scope, engine);
...@@ -125,6 +126,7 @@ class OpConverter { ...@@ -125,6 +126,7 @@ class OpConverter {
std::unordered_map<std::string, OpConverter*> converters_; std::unordered_map<std::string, OpConverter*> converters_;
// fluid inference scope // fluid inference scope
framework::Scope* scope_{nullptr}; framework::Scope* scope_{nullptr};
std::mutex mut_;
}; };
} // namespace tensorrt } // namespace tensorrt
......
op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n") file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
DEPS tensorrt_engine_op DEPS tensorrt_engine_op
analysis) analysis)
...@@ -21,8 +21,6 @@ ...@@ -21,8 +21,6 @@
namespace paddle { namespace paddle {
DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
namespace operators { namespace operators {
class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -50,6 +48,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference { ...@@ -50,6 +48,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp, REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker); ops::TensorRTEngineOpMaker);
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
// Short alias for the operators namespace, used by the registration below.
namespace ops = paddle::operators;
// Registers the CUDA kernels for the `tensorrt_engine` op: one
// TensorRTEngineKernel instantiation on CUDADeviceContext for each of the
// element types float, double, int and int64_t.
REGISTER_OP_CUDA_KERNEL(
tensorrt_engine,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
...@@ -27,8 +27,6 @@ ...@@ -27,8 +27,6 @@
namespace paddle { namespace paddle {
DECLARE_int32(tensorrt_engine_batch_size);
namespace operators { namespace operators {
using FluidDT = framework::proto::VarType_Type; using FluidDT = framework::proto::VarType_Type;
...@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) { ...@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
return TRT_DT::kINT32; return TRT_DT::kINT32;
} }
nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) { nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
PADDLE_ENFORCE_GT(shape.size(), 1UL, PADDLE_ENFORCE_GT(shape.size(), 1UL,
"TensorRT' tensor input requires at least 2 dimensions"); "TensorRT' tensor input requires at least 2 dimensions");
PADDLE_ENFORCE_LE(shape.size(), 4UL, PADDLE_ENFORCE_LE(shape.size(), 4UL,
...@@ -63,131 +61,121 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) { ...@@ -63,131 +61,121 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
} // namespace // NOLINT } // namespace // NOLINT
using inference::Singleton; using inference::Singleton;
using inference::tensorrt::TRT_EngineManager; using inference::tensorrt::TensorRTEngine;
class TensorRTEngineOp : public framework::OperatorBase {
private:
std::string engine_name_;
std::vector<std::string> input_names_;
std::unordered_set<std::string> param_names_;
mutable std::unique_ptr<TensorRTEngine> trt_engine_;
int max_batch_size_;
int workspace_size_;
class TensorRTEngineOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; TensorRTEngineOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: framework::OperatorBase(type, inputs, outputs, attrs) {
engine_name_ = Attr<std::string>("engine_uniq_key");
input_names_ = Inputs("Xs");
max_batch_size_ = Attr<int>("max_batch_size");
workspace_size_ = Attr<int>("workspace_size");
auto params = Attr<std::vector<std::string>>("parameters");
for (const auto &param : params) {
param_names_.insert(param);
}
}
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override {} void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
framework::OpKernelType GetExpectedKernelType( RunTrt(scope, dev_place);
const framework::ExecutionContext& ctx) const override {
auto input0 = ctx.Inputs("Xs").front();
framework::OpKernelType kt = framework::OpKernelType(
framework::ToDataType(ctx.scope()
.FindVar(input0)
->GetMutable<framework::LoDTensor>()
->type()),
ctx.GetPlace());
return kt;
} }
};
template <typename DeviceContext, typename T> void RunTrt(const framework::Scope &scope,
class TensorRTEngineKernel : public framework::OpKernel<T> { const platform::Place &dev_place) const {
public: int runtime_batch = 1;
void Compute(const framework::ExecutionContext& context) const override { if (trt_engine_.get() == nullptr) {
auto engine_name = context.Attr<std::string>("engine_uniq_key"); trt_engine_.reset(new TensorRTEngine(
int max_batch_size = context.Attr<int>("max_batch_size"); max_batch_size_, workspace_size_, nullptr,
if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) { boost::get<platform::CUDAPlace>(dev_place).device));
Prepare(context); Prepare(scope, dev_place, trt_engine_.get());
} }
auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
auto input_names = context.op().Inputs("Xs"); auto *engine = trt_engine_.get();
PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping"); Attr<std::vector<std::string>>("output_name_mapping");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
// Convert input tensor from fluid to engine. // Convert input tensor from fluid to engine.
for (const auto& x : context.Inputs("Xs")) { for (const auto &x : Inputs("Xs")) {
if (parameters.count(x)) continue; if (param_names_.count(x)) continue;
// convert input and copy to TRT engine's buffer // convert input and copy to TRT engine's buffer
auto& t = inference::analysis::GetFromScope<framework::LoDTensor>( auto &t =
context.scope(), x); inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
auto t_shape = framework::vectorize(t.dims());
runtime_batch = t_shape[0];
if (platform::is_cpu_place(t.place())) { if (platform::is_cpu_place(t.place())) {
engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()), engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size()); t.memory_size());
} else { } else {
engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()), engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
t.memory_size()); t.memory_size());
} }
} }
PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
// Execute the engine. // Execute the engine.
PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); engine->Execute(runtime_batch);
engine->Execute(FLAGS_tensorrt_engine_batch_size);
// Convert output tensor from engine to fluid // Convert output tensor from engine to fluid
int output_index = 0; int output_index = 0;
VLOG(4) << "TensorRT Engine Op Outputs:"; VLOG(4) << "TensorRT Engine Op Outputs:";
for (const auto& y : context.Outputs("Ys")) { for (const auto &y : Outputs("Ys")) {
VLOG(4) << y; VLOG(4) << y;
// convert output and copy to fluid. // convert output and copy to fluid.
nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
auto dims = trt_t->getDimensions(); auto dims = trt_t->getDimensions();
// Use the output ITensor's dims to reshape the Fluid Tensor. // Use the output ITensor's dims to reshape the Fluid Tensor.
// The ITensor doesn't contain the batch size dim. // The ITensor doesn't contain the batch size dim.
std::vector<int> ddim; std::vector<int> ddim;
ddim.push_back(FLAGS_tensorrt_engine_batch_size); ddim.push_back(runtime_batch);
for (int i = 0; i < dims.nbDims; i++) { for (int i = 0; i < dims.nbDims; i++) {
ddim.push_back(dims.d[i]); ddim.push_back(dims.d[i]);
} }
auto* fluid_v = context.scope().FindVar(y); auto *fluid_v = scope.FindVar(y);
PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>(); auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
fluid_t->Resize(framework::make_ddim(ddim)); fluid_t->Resize(framework::make_ddim(ddim));
// TODO(Superjomn) find some way to determine which device to output the
// tensor.
// if (platform::is_cpu_place(fluid_t->place())) {
// TODO(Superjomn) change this float to dtype size. // TODO(Superjomn) change this float to dtype size.
auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) * auto size =
FLAGS_tensorrt_engine_batch_size; inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
engine->GetOutputInGPU( engine->GetOutputInGPU(
output_maps[output_index], output_maps[output_index],
fluid_t->mutable_data<float>(platform::CUDAPlace( fluid_t->mutable_data<float>(platform::CUDAPlace(
boost::get<platform::CUDAPlace>(context.GetPlace()).device)), boost::get<platform::CUDAPlace>(dev_place).device)),
size * sizeof(float)); size * sizeof(float));
output_index += 1; output_index += 1;
} }
cudaStreamSynchronize(*engine->stream()); cudaStreamSynchronize(*engine->stream());
} }
protected: void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
void Prepare(const framework::ExecutionContext& context) const { TensorRTEngine *engine) const {
VLOG(4) << "Prepare engine"; VLOG(4) << "Prepare engine";
// Get the ProgramDesc and pass to convert.
framework::proto::BlockDesc block_desc; framework::proto::BlockDesc block_desc;
block_desc.ParseFromString(context.Attr<std::string>("subgraph")); block_desc.ParseFromString(Attr<std::string>("subgraph"));
int max_batch_size = context.Attr<int>("max_batch_size");
int workspace_size = context.Attr<int>("workspace_size");
auto params = context.Attr<std::vector<std::string>>("parameters");
std::unordered_set<std::string> parameters;
for (const auto& param : params) {
parameters.insert(param);
}
std::vector<std::string> output_maps = std::vector<std::string> output_maps =
context.Attr<std::vector<std::string>>("output_name_mapping"); Attr<std::vector<std::string>>("output_name_mapping");
// TODO(Superjomn) replace this with a different stream
auto* engine = Singleton<TRT_EngineManager>::Global().Create(
max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
context.Attr<std::string>("engine_uniq_key"),
boost::get<platform::CUDAPlace>(context.GetPlace()).device);
engine->InitNetwork(); engine->InitNetwork();
...@@ -195,39 +183,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> { ...@@ -195,39 +183,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
VLOG(4) << "parsed var size " << block.AllVars().size(); VLOG(4) << "parsed var size " << block.AllVars().size();
// Add inputs // Add inputs
VLOG(4) << "declare inputs"; VLOG(4) << "declare inputs";
for (auto& input : context.Inputs("Xs")) { for (auto &input : Inputs("Xs")) {
if (parameters.count(input)) continue; if (param_names_.count(input)) continue;
VLOG(4) << "declare input " << input; VLOG(4) << "declare input " << input;
auto* var = block.FindVar(input);
auto &t =
inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
auto t_shape = framework::vectorize(t.dims());
auto *var = block.FindVar(input);
// TensorRT engine need to create parameters. The parameter's description // TensorRT engine need to create parameters. The parameter's description
// should be set in // should be set in
PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE(var, "no variable called %s", input);
PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
"TensorRT engine only takes LoDTensor as input"); "TensorRT engine only takes LoDTensor as input");
auto shape = var->GetShape();
// For the special batch_size placeholder -1, drop it and pass the real
// shape of data.
// TODO(Superjomn) fix this with batch broadcast, or it can't handle
// variational batch size.
if (shape[0] == -1) {
shape[0] = FLAGS_tensorrt_engine_batch_size;
}
engine->DeclareInput( engine->DeclareInput(
input, FluidDataType2TRT( input, FluidDataType2TRT(
var->Proto()->type().lod_tensor().tensor().data_type()), var->Proto()->type().lod_tensor().tensor().data_type()),
Vec2TRT_Dims(shape)); Vec2TRT_Dims(t_shape));
} }
inference::Singleton<inference::tensorrt::OpConverter>::Global() inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlock(block_desc, parameters, context.scope(), engine); .ConvertBlock(block_desc, param_names_, scope, engine);
// Add outputs // Add outputs
for (auto& output : output_maps) { for (auto &output : output_maps) {
if (!engine->HasDeclared(output)) { engine->DeclareOutput(output);
engine->DeclareOutput(output);
}
} }
engine->FreezeNetwork(); engine->FreezeNetwork();
} }
}; };
......
...@@ -24,8 +24,6 @@ limitations under the License. */ ...@@ -24,8 +24,6 @@ limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
USE_CUDA_ONLY_OP(tensorrt_engine);
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册