From d91e84ae0c7635ed1828b1e9e9961e04c524cafa Mon Sep 17 00:00:00 2001 From: nhzlx Date: Tue, 21 Aug 2018 11:19:28 +0000 Subject: [PATCH] fix ssa bug with batchnorm and refine the trt interface Merge pull request from #12843 from NHZlX:fix_ssa_bug_for_trt --- paddle/fluid/inference/analysis/analyzer.cc | 3 ++- .../analysis/data_flow_graph_to_fluid_pass.cc | 5 ----- .../analysis/data_flow_graph_to_fluid_pass.h | 3 --- .../analysis/fluid_to_data_flow_graph_pass.cc | 2 +- .../api/api_tensorrt_subgraph_engine.cc | 13 +++++++++++- .../inference/api/paddle_inference_api.h | 8 +++++++ paddle/fluid/operators/tensorrt_engine_op.cc | 4 ++-- paddle/fluid/operators/tensorrt_engine_op.h | 21 +++++++++++++------ .../operators/tensorrt_engine_op_test.cc | 7 +++++-- 9 files changed, 45 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 9318f1089..912615c94 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -44,7 +44,8 @@ class DfgPassManagerImpl final : public DfgPassManager { if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( - {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax"}); + {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", + "depthwise_conv2d", "batch_norm"}); if (!node->IsFunction()) return false; const auto* func = static_cast(node); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index f40d471cb..ce0639a61 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -23,9 +23,6 @@ namespace paddle { namespace inference { -DEFINE_int32(tensorrt_max_batchsize, 1, "TensorRT maximum batch size"); -DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); - namespace analysis { using framework::proto::ProgramDesc; @@ -190,8 +187,6 @@ void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, // Set attrs SetAttr(desc.Proto(), "subgraph", block->SerializeAsString()); SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); - SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); SetAttr(desc.Proto(), "output_name_mapping", output_mapping); node->SetPbMsg(desc.Proto()->SerializeAsString()); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index 59c47365a..0c9a8a0b7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -27,9 +27,6 @@ namespace paddle { namespace inference { -DECLARE_int32(tensorrt_max_batchsize); -DECLARE_int32(tensorrt_workspace_size); - namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 511631d3e..16d82b5aa 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -92,6 +92,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); + unique_written_vars.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { @@ -112,7 +113,6 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { } out->inlinks.push_back(o); o->outlinks.push_back(out); - unique_written_vars.insert(out); } } } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 45b5a7638..9ac037297 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/operators/tensorrt_engine_op.h" @@ -32,7 +33,8 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { bool Init(const std::shared_ptr& parent_scope) { VLOG(3) << "Predictor::init()"; - + FLAGS_tensorrt_max_batch_size = config_.max_batch_size; + FLAGS_tensorrt_workspace_size = config_.workspace_size; if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); } else { @@ -150,3 +152,12 @@ CreatePaddlePredictor( } } // namespace paddle + +USE_TRT_CONVERTER(elementwise_add_weight); +USE_TRT_CONVERTER(mul); +USE_TRT_CONVERTER(conv2d); +USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(fc); +USE_TRT_CONVERTER(pool2d); +USE_TRT_CONVERTER(softmax); +USE_TRT_CONVERTER(batch_norm); diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 794534467..da6c2cfc2 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -137,6 +137,14 @@ struct AnakinConfig : public PaddlePredictor::Config { struct TensorRTConfig : public NativeConfig { // Determine whether a subgraph will be executed by TRT. int min_subgraph_size{1}; + // While TensorRT allows an engine optimized for a given max batch size + // to run at any smaller size, the performance for those smaller + // sizes may not be as well-optimized. Therefore, Max batch is best + // equivalent to the runtime batch size. + int max_batch_size{1}; + // For workspace_size, refer it from here: + // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting + int workspace_size{1 << 30}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 4d930e9ce..1048d3017 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -22,6 +22,8 @@ namespace paddle { DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); +DEFINE_int32(tensorrt_max_batch_size, 1, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 16 << 20, "TensorRT workspace size"); namespace operators { @@ -32,8 +34,6 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("engine_uniq_key", "unique key for the TRT engine."); - AddAttr("max_batch", "the maximum batch size."); - AddAttr("max_workspace", "the maximum batch size."); AddComment("TensorRT engine operator."); } }; diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index f2ec7f066..bc556ab36 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -28,6 +28,8 @@ namespace paddle { DECLARE_int32(tensorrt_engine_batch_size); +DECLARE_int32(tensorrt_max_batch_size); +DECLARE_int32(tensorrt_workspace_size); namespace operators { @@ -54,8 +56,10 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE_EQ(shape.size(), 4UL); - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); } } // namespace @@ -95,7 +99,7 @@ class TensorRTEngineKernel : public framework::OpKernel { auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, - context.Attr("max_batch")); + FLAGS_tensorrt_max_batch_size); std::vector output_maps = context.Attr>("output_name_mapping"); @@ -132,7 +136,12 @@ class TensorRTEngineKernel : public framework::OpKernel { nvinfer1::ITensor* trt_t = engine->GetITensor(output_maps[output_index]); auto dims = trt_t->getDimensions(); // Use the output ITensor's dims to reshape the Fluid Tensor. - std::vector ddim(dims.d, dims.d + dims.nbDims); + // The ITensor doesn't contain the batch size dim. + std::vector ddim; + ddim.push_back(FLAGS_tensorrt_engine_batch_size); + for (int i = 0; i < dims.nbDims; i++) { + ddim.push_back(dims.d[i]); + } auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); @@ -168,8 +177,8 @@ class TensorRTEngineKernel : public framework::OpKernel { // Get the ProgramDesc and pass to convert. framework::proto::BlockDesc block_desc; block_desc.ParseFromString(context.Attr("subgraph")); - int max_batch = context.Attr("max_batch"); - auto max_workspace = context.Attr("max_workspace"); + int max_batch = FLAGS_tensorrt_max_batch_size; + auto max_workspace = FLAGS_tensorrt_workspace_size; auto params = context.Attr>("parameters"); std::unordered_set parameters; for (const auto& param : params) { diff --git a/paddle/fluid/operators/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt_engine_op_test.cc index 97c375361..27c1d2976 100644 --- a/paddle/fluid/operators/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/tensorrt_engine_op.h" #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -57,6 +58,8 @@ void AddTensorToBlockDesc(framework::proto::BlockDesc* block, using inference::analysis::SetAttr; TEST(TensorRTEngineOp, manual) { + FLAGS_tensorrt_engine_batch_size = 2; + FLAGS_tensorrt_max_batch_size = 2; framework::ProgramDesc program; auto* block_ = program.Proto()->add_blocks(); block_->set_idx(0); @@ -98,8 +101,6 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetOutput("Ys", std::vector({"z0"})); SetAttr(engine_op_desc.Proto(), "subgraph", block_->SerializeAsString()); - SetAttr(engine_op_desc.Proto(), "max_batch", 100); - SetAttr(engine_op_desc.Proto(), "max_workspace", 1 << 10); SetAttr(engine_op_desc.Proto(), "engine_uniq_key", "a_engine"); SetAttr>(engine_op_desc.Proto(), "parameters", std::vector({})); @@ -128,6 +129,8 @@ TEST(TensorRTEngineOp, manual) { } void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + FLAGS_tensorrt_engine_batch_size = batch_size; + FLAGS_tensorrt_max_batch_size = batch_size; framework::ProgramDesc program; framework::Scope scope; platform::CUDAPlace place; -- GitLab