diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index d61d635ed707bc455d495f2420925a3585234b5c..91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -103,6 +103,7 @@ class OpConverter {
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
                     const framework::Scope& scope, TensorRTEngine* engine) {
+    std::unique_lock<std::mutex> lk(mut_);
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
       ConvertOp(op, parameters, scope, engine);
@@ -125,6 +126,7 @@ class OpConverter {
   std::unordered_map<std::string, OpConverter*> converters_;
   // fluid inference scope
   framework::Scope* scope_{nullptr};
+  std::mutex mut_;
 };

 }  // namespace tensorrt
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
index eee0b90fbae216e804e62993313796e914fcef5a..6b551d13f1dc5cd1c82a15a8347b278e8f795c1c 100644
--- a/paddle/fluid/operators/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,5 +1,5 @@
 op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
-file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(tensorrt_engine);\n")
+file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(tensorrt_engine);\n")
 nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
         DEPS tensorrt_engine_op
         analysis)
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 3cf2ce3c7ef87dcf75548f7d9c3a55d06ed765e8..f1ab59e3972efe598472b2d18077b15ce88716c4 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -21,8 +21,6 @@

 namespace paddle {

-DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
-
 namespace operators {

 class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -50,6 +48,6 @@ class TensorRTEngineInferVarType : public framework::VarTypeInference {
 namespace ops = paddle::operators;

 REGISTER_OPERATOR(tensorrt_engine, ops::TensorRTEngineOp,
-                  ops::TensorRTEngineOpMaker, ops::TensorRTEngineOpMaker);
+                  ops::TensorRTEngineOpMaker);

 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
deleted file mode 100644
index cbe1b426f65386e722a7b02ec1fdfdf75bfd770c..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cu.cc
+++ /dev/null
@@ -1,24 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h"
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_CUDA_KERNEL(
-    tensorrt_engine,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::TensorRTEngineKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 6eef4c98c48af014f8e19fde93aaa9fbb6903867..c19c315f798316d78d36d5fce788faf4abd0686d 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -27,8 +27,6 @@

 namespace paddle {

-DECLARE_int32(tensorrt_engine_batch_size);
-
 namespace operators {

 using FluidDT = framework::proto::VarType_Type;
@@ -49,7 +47,7 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
   return TRT_DT::kINT32;
 }

-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
   PADDLE_ENFORCE_GT(shape.size(), 1UL,
                     "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
@@ -63,131 +61,121 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
 }  // namespace  // NOLINT

 using inference::Singleton;
-using inference::tensorrt::TRT_EngineManager;
+using inference::tensorrt::TensorRTEngine;
+
+class TensorRTEngineOp : public framework::OperatorBase {
+ private:
+  std::string engine_name_;
+  std::vector<std::string> input_names_;
+  std::unordered_set<std::string> param_names_;
+  mutable std::unique_ptr<TensorRTEngine> trt_engine_;
+  int max_batch_size_;
+  int workspace_size_;

-class TensorRTEngineOp : public framework::OperatorWithKernel {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  TensorRTEngineOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {
+    engine_name_ = Attr<std::string>("engine_uniq_key");
+    input_names_ = Inputs("Xs");
+    max_batch_size_ = Attr<int>("max_batch_size");
+    workspace_size_ = Attr<int>("workspace_size");
+
+    auto params = Attr<std::vector<std::string>>("parameters");
+    for (const auto &param : params) {
+      param_names_.insert(param);
+    }
+  }

  protected:
-  void InferShape(framework::InferShapeContext* ctx) const override {}
-
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext& ctx) const override {
-    auto input0 = ctx.Inputs("Xs").front();
-    framework::OpKernelType kt = framework::OpKernelType(
-        framework::ToDataType(ctx.scope()
-                                  .FindVar(input0)
-                                  ->GetMutable<framework::LoDTensor>()
-                                  ->type()),
-        ctx.GetPlace());
-    return kt;
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &dev_place) const override {
+    RunTrt(scope, dev_place);
   }
-};

-template <typename DeviceContext, typename T>
-class TensorRTEngineKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto engine_name = context.Attr<std::string>("engine_uniq_key");
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
-      Prepare(context);
+  void RunTrt(const framework::Scope &scope,
+              const platform::Place &dev_place) const {
+    int runtime_batch = 1;
+    if (trt_engine_.get() == nullptr) {
+      trt_engine_.reset(new TensorRTEngine(
+          max_batch_size_, workspace_size_, nullptr,
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      Prepare(scope, dev_place, trt_engine_.get());
     }
-    auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
-    auto input_names = context.op().Inputs("Xs");
-    PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, max_batch_size);
+
+    auto *engine = trt_engine_.get();
+    PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");

     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
+        Attr<std::vector<std::string>>("output_name_mapping");

-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
     // Convert input tensor from fluid to engine.
-    for (const auto& x : context.Inputs("Xs")) {
-      if (parameters.count(x)) continue;
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
-      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(
-          context.scope(), x);
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      auto t_shape = framework::vectorize(t.dims());
+      runtime_batch = t_shape[0];
       if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       } else {
-        engine->SetInputFromGPU(x, static_cast<const void*>(t.data<void>()),
+        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
                                 t.memory_size());
       }
     }
+
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
     // Execute the engine.
-    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
-    engine->Execute(FLAGS_tensorrt_engine_batch_size);
+    engine->Execute(runtime_batch);

     // Convert output tensor from engine to fluid
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
-    for (const auto& y : context.Outputs("Ys")) {
+    for (const auto &y : Outputs("Ys")) {
       VLOG(4) << y;
       // convert output and copy to fluid.
-      nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]);
+      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
       auto dims = trt_t->getDimensions();
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
-      ddim.push_back(FLAGS_tensorrt_engine_batch_size);
+      ddim.push_back(runtime_batch);
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }

-      auto* fluid_v = context.scope().FindVar(y);
+      auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
-      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
+      auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();

       fluid_t->Resize(framework::make_ddim(ddim));

-      // TODO(Superjomn) find some way to determine which device to output the
-      // tensor.
-      // if (platform::is_cpu_place(fluid_t->place())) {
       // TODO(Superjomn) change this float to dtype size.
-      auto size = inference::analysis::AccuDims(dims.d, dims.nbDims) *
-                  FLAGS_tensorrt_engine_batch_size;
+      auto size =
+          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
       engine->GetOutputInGPU(
           output_maps[output_index],
           fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(context.GetPlace()).device)),
+              boost::get<platform::CUDAPlace>(dev_place).device)),
           size * sizeof(float));
-
       output_index += 1;
     }

     cudaStreamSynchronize(*engine->stream());
   }

- protected:
-  void Prepare(const framework::ExecutionContext& context) const {
+  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
+               TensorRTEngine *engine) const {
     VLOG(4) << "Prepare engine";
-    // Get the ProgramDesc and pass to convert.
     framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(context.Attr<std::string>("subgraph"));
-    int max_batch_size = context.Attr<int>("max_batch_size");
-    int workspace_size = context.Attr<int>("workspace_size");
-
-    auto params = context.Attr<std::vector<std::string>>("parameters");
-    std::unordered_set<std::string> parameters;
-    for (const auto& param : params) {
-      parameters.insert(param);
-    }
+    block_desc.ParseFromString(Attr<std::string>("subgraph"));

     std::vector<std::string> output_maps =
-        context.Attr<std::vector<std::string>>("output_name_mapping");
-
-    // TODO(Superjomn) replace this with a different stream
-    auto* engine = Singleton<TRT_EngineManager>::Global().Create(
-        max_batch_size, workspace_size, nullptr /*engine hold its own stream*/,
-        context.Attr<std::string>("engine_uniq_key"),
-        boost::get<platform::CUDAPlace>(context.GetPlace()).device);
+        Attr<std::vector<std::string>>("output_name_mapping");

     engine->InitNetwork();

@@ -195,39 +183,33 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     VLOG(4) << "parsed var size " << block.AllVars().size();
     // Add inputs
     VLOG(4) << "declare inputs";
-    for (auto& input : context.Inputs("Xs")) {
-      if (parameters.count(input)) continue;
+    for (auto &input : Inputs("Xs")) {
+      if (param_names_.count(input)) continue;
       VLOG(4) << "declare input " << input;
-      auto* var = block.FindVar(input);
+
+      auto &t =
+          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
+      auto t_shape = framework::vectorize(t.dims());
+
+      auto *var = block.FindVar(input);
       // TensorRT engine need to create parameters. The parameter's description
       // should be set in
       PADDLE_ENFORCE(var, "no variable called %s", input);
       PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                         "TensorRT engine only takes LoDTensor as input");
-      auto shape = var->GetShape();
-      // For the special batch_size placeholder -1, drop it and pass the real
-      // shape of data.
-      // TODO(Superjomn) fix this with batch broadcast, or it can't handle
-      // variational batch size.
-      if (shape[0] == -1) {
-        shape[0] = FLAGS_tensorrt_engine_batch_size;
-      }
+
       engine->DeclareInput(
           input, FluidDataType2TRT(
                      var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(shape));
+          Vec2TRT_Dims(t_shape));
     }
-
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, parameters, context.scope(), engine);
+        .ConvertBlock(block_desc, param_names_, scope, engine);

     // Add outputs
-    for (auto& output : output_maps) {
-      if (!engine->HasDeclared(output)) {
-        engine->DeclareOutput(output);
-      }
+    for (auto &output : output_maps) {
+      engine->DeclareOutput(output);
     }
-
     engine->FreezeNetwork();
   }
 };
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 56bdd6c2f2801967829f2baf889b5517a1d9d8d9..6f8adb00edd7490a9cd547c16b0d42e7e6532c98 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -24,8 +24,6 @@ limitations under the License. */
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
 #include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"

-USE_CUDA_ONLY_OP(tensorrt_engine);
-
 namespace paddle {
 namespace operators {