From a0566010c431d09e3360cb970609a2330f8dadfe Mon Sep 17 00:00:00 2001
From: weishengying <63448337+weishengying@users.noreply.github.com>
Date: Mon, 19 Sep 2022 19:56:40 +0800
Subject: [PATCH] Add symbolic shape deduction function for general Plugin
 mechanism (#46179)

---
 .../tensorrt/dynamic_shape_infermeta.cc       | 54 +++++++++++++++++++
 .../dynamic_shape_infermeta_registry.h        |  2 +
 .../tensorrt/plugin/generic_plugin.cu         | 38 ++++++++-----
 .../tensorrt/plugin/generic_plugin.h          |  9 ++--
 paddle/phi/core/kernel_context.h              |  7 +++
 .../test_trt_convert_instance_norm.py         | 28 +++++++++-
 .../ir/inference/test_trt_convert_yolo_box.py | 27 ++++++++--
 7 files changed, 142 insertions(+), 23 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
index 1d75f0a7fb..bba2e84e32 100644
--- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
+++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc
@@ -54,7 +54,61 @@ nvinfer1::DimsExprs GatherNdInferMeta(
   }
   return output;
 }
+
+nvinfer1::DimsExprs YoloBoxInferMeta(
+    int output_index,
+    const nvinfer1::DimsExprs* inputs,
+    int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder,  // NOLINT
+    const framework::OpDesc& op_desc) {
+  PADDLE_ENFORCE_EQ(
+      nb_inputs,
+      2,
+      phi::errors::InvalidArgument("The number of inputs of yolo_box "
+                                   "should be 2, but received %d.",
+                                   nb_inputs));
+
+  const nvinfer1::DimsExprs dim_x = inputs[0];
+
+  auto anchors = PADDLE_GET_CONST(std::vector<int>, op_desc.GetAttr("anchors"));
+  int anchor_num = anchors.size() / 2;
+
+  // box_num = dim_x[2] * dim_x[3] * anchor_num;
+  const nvinfer1::IDimensionExpr* box_num = expr_builder.operation(
+      nvinfer1::DimensionOperation::kPROD,
+      *expr_builder.operation(
+          nvinfer1::DimensionOperation::kPROD, *dim_x.d[2], *dim_x.d[3]),
+      *expr_builder.constant(anchor_num));
+
+  nvinfer1::DimsExprs output;
+  output.nbDims = 3;
+  if (output_index == 0) {
+    output.d[0] = dim_x.d[0];
+    output.d[1] = box_num;
+    output.d[2] = expr_builder.constant(4);
+  } else {
+    auto class_num = PADDLE_GET_CONST(int, op_desc.GetAttr("class_num"));
+    output.d[0] = dim_x.d[0];
+    output.d[1] = box_num;
+    output.d[2] = expr_builder.constant(class_num);
+  }
+  return output;
+}
+
+nvinfer1::DimsExprs InstanceNormInferMeta(
+    int output_index,
+    const nvinfer1::DimsExprs* inputs,
+    int nb_inputs,
+    nvinfer1::IExprBuilder& expr_builder,  // NOLINT
+    const framework::OpDesc& op_desc) {
+  nvinfer1::DimsExprs x_dims = inputs[0];
+  return x_dims;
+}
+
 PD_REGISTER_DYNAMIC_INFER_META_FN(gather_nd, GatherNdInferMeta);
+PD_REGISTER_DYNAMIC_INFER_META_FN(yolo_box, YoloBoxInferMeta);
+PD_REGISTER_DYNAMIC_INFER_META_FN(instance_norm, InstanceNormInferMeta);
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h
index f31040772c..0bc2ff78b6 100644
--- a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h
+++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h
@@ -21,6 +21,8 @@ namespace inference {
 namespace tensorrt {
 
 USE_TRT_DYNAMIC_INFER_META_FN(gather_nd);
+USE_TRT_DYNAMIC_INFER_META_FN(yolo_box);
+USE_TRT_DYNAMIC_INFER_META_FN(instance_norm);
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
index 2fc6e881e8..febabc6d8e 100644
--- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu
@@ -216,6 +216,7 @@ void BuildPhiKernelContextAttr(const framework::OpDesc& op_desc,
       }
     }
   }
+  CHECK_EQ(attr_names.size(), kernel_context->AttrsSize());
 }
 
 GenericPlugin::GenericPlugin(
@@ -333,12 +334,16 @@ int GenericPlugin::initialize() TRT_NOEXCEPT {
   platform::CUDAPlace place(platform::GetCurrentDeviceId());
   auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(place));
 
-  phi_kernel_context_ = new phi::KernelContext(dev_ctx);
-  dense_tensor_inputs_ = new std::vector<phi::DenseTensor>(getNbInputs());
-  dense_tensor_outputs_ = new std::vector<phi::DenseTensor>(getNbOutputs());
+  if (!phi_kernel_context_) {
+    phi_kernel_context_ = new phi::KernelContext(dev_ctx);
+    BuildPhiKernelContextAttr(
+        op_desc_, phi_kernel_context_, phi_kernel_signature, phi_kernel);
+  }
+  if (!dense_tensor_inputs_)
+    dense_tensor_inputs_ = new std::vector<phi::DenseTensor>(getNbInputs());
+  if (!dense_tensor_outputs_)
+    dense_tensor_outputs_ = new std::vector<phi::DenseTensor>(getNbOutputs());
 
-  BuildPhiKernelContextAttr(
-      op_desc_, phi_kernel_context_, phi_kernel_signature, phi_kernel);
   return 0;
 }
 
@@ -387,26 +392,28 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
   platform::CUDAPlace place(platform::GetCurrentDeviceId());
 
   // [TODO]now generic plugin do not support FP16 and INT8 precision
-  auto protoType2PhiType = [](int proto_type) -> phi::DataType {
+  auto protoType2PhiType = [](int proto_type) -> std::pair<phi::DataType, int> {
     if (proto_type ==
         static_cast<int>(framework::proto::VarType_Type::VarType_Type_FP32))
-      return phi::DataType::FLOAT32;
+      return {phi::DataType::FLOAT32, sizeof(float)};
     else if (proto_type ==
                  static_cast<int>(
                      framework::proto::VarType_Type::VarType_Type_INT64) ||
              proto_type ==
                  static_cast<int>(
                      framework::proto::VarType_Type::VarType_Type_INT32))
-      return phi::DataType::INT32;
+      return {phi::DataType::INT32, sizeof(int32_t)};
     else if (proto_type ==
                  static_cast<int>(
                      framework::proto::VarType_Type::VarType_Type_BOOL))
-      return phi::DataType::BOOL;
+      return {phi::DataType::BOOL, sizeof(bool)};
     else
       CHECK(false) << "precision is not supported";
   };
 
   // input
+  phi_kernel_context_->ClearInputOutput();
+
   for (int i = 0; i < getNbInputs(); i++) {
     auto const& input_dims = input_desc[i].dims;
 
@@ -417,11 +424,12 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     int input_numel = 1;
     for (int k = 0; k < input_shape.size(); k++)
       input_numel *= input_shape[k];
-    phi::DenseTensorMeta input_meta(protoType2PhiType(inputs_data_type_[i]),
+    auto data_type_and_size = protoType2PhiType(inputs_data_type_[i]);
+    phi::DenseTensorMeta input_meta(data_type_and_size.first,
                                     phi::make_ddim(input_shape));
     std::shared_ptr<phi::Allocation> input_alloc(
         new phi::Allocation((void*)(inputs[i]),  // NOLINT
-                            input_numel * sizeof(int32_t),
+                            input_numel * data_type_and_size.second,
                             place));
     (*dense_tensor_inputs_)[i] =
         std::move(phi::DenseTensor(input_alloc, input_meta));
@@ -440,11 +448,12 @@ int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc,
     for (int k = 0; k < output_shape.size(); k++)
       output_numel *= output_shape[k];
 
-    phi::DenseTensorMeta output_meta(protoType2PhiType(outputs_data_type_[i]),
+    auto data_type_and_size = protoType2PhiType(outputs_data_type_[i]);
+    phi::DenseTensorMeta output_meta(data_type_and_size.first,
                                      phi::make_ddim(output_shape));
     std::shared_ptr<phi::Allocation> output_alloc(
         new phi::Allocation(reinterpret_cast<void*>(outputs[i]),
-                            output_numel * sizeof(float),
+                            output_numel * data_type_and_size.second,
                             place));
     phi::DenseTensor output_densetonsor(output_alloc, output_meta);
     (*dense_tensor_outputs_)[i] =
@@ -452,6 +461,9 @@
     phi_kernel_context_->EmplaceBackOutput(&((*dense_tensor_outputs_)[i]));
   }
 
+  CHECK_EQ(phi_kernel_context_->InputsSize(), getNbInputs());
+  CHECK_EQ(phi_kernel_context_->OutputsSize(), getNbOutputs());
+
   (*phi_kernel_)(phi_kernel_context_);
 
   return cudaGetLastError() != cudaSuccess;
diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h
index 39730937af..5705078ffa 100644
--- a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h
@@ -128,10 +128,11 @@ class GenericPlugin : public DynamicPluginTensorRT {
   framework::OpDesc op_desc_;
 
  private:
-  phi::KernelContext* phi_kernel_context_;
-  const phi::Kernel* phi_kernel_;
-  std::vector<phi::DenseTensor>* dense_tensor_inputs_;
-  std::vector<phi::DenseTensor>* dense_tensor_outputs_;
+  const phi::Kernel* phi_kernel_{nullptr};
+
+  phi::KernelContext* phi_kernel_context_{nullptr};
+  std::vector<phi::DenseTensor>* dense_tensor_inputs_{nullptr};
+  std::vector<phi::DenseTensor>* dense_tensor_outputs_{nullptr};
 
  private:
   InputOutPutVarInfo in_out_info_;
diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h
index 7b79138fe7..107a1fe49c 100644
--- a/paddle/phi/core/kernel_context.h
+++ b/paddle/phi/core/kernel_context.h
@@ -144,6 +144,13 @@ class KernelContext {
   size_t OutputsSize() const { return outputs_.size(); }
   size_t AttrsSize() const { return attrs_.size(); }
 
+  void ClearInputOutput() {
+    inputs_.clear();
+    input_range_.clear();
+    outputs_.clear();
+    output_range_.clear();
+  }
+
  private:
   DeviceContext* dev_ctx_;
 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
index 56767b3457..a9eef0e296 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py
@@ -20,6 +20,7 @@ import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
 import unittest
+import os
 
 
 class TrtConvertInstanceNormTest(TrtLayerAutoScanTest):
@@ -113,7 +114,9 @@ class TrtConvertInstanceNormTest(TrtLayerAutoScanTest):
             self.dynamic_shape.opt_input_shape = {}
 
         def generate_trt_nodes_num(attrs, dynamic_shape):
-            if dynamic_shape or self.in_dim != 4:
+            if dynamic_shape:
+                return 1, 2
+            if self.in_dim != 4:
                 return 0, 3
             return 1, 2
 
@@ -139,7 +142,30 @@ class TrtConvertInstanceNormTest(TrtLayerAutoScanTest):
             yield self.create_inference_config(), generate_trt_nodes_num(
                 attrs, True), (1e-3, 1e-3)
 
+    def add_skip_trt_case(self):
+
+        def teller1(program_config, predictor_config):
+            if len(
+                    self.dynamic_shape.min_input_shape
+            ) != 0 and self.trt_param.precision == paddle_infer.PrecisionType.Half:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "The output has diff between gpu and trt in dynamic fp16 mode.")
+
+        def teller2(program_config, predictor_config):
+            if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt':
+                return True
+            return False
+
+        self.add_skip_case(
+            teller2, SkipReasons.TRT_NOT_SUPPORT,
+            "The output has diff between gpu and trt in Windows.")
+
     def test(self):
+        self.add_skip_trt_case()
         self.run_test()
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
index fec4476939..001c1a1ccb 100644
--- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py
@@ -19,6 +19,7 @@ import paddle.inference as paddle_infer
 from functools import partial
 from typing import Optional, List, Callable, Dict, Any, Set
 import unittest
+import os
 
 
 class TrtConvertYoloBoxTest(TrtLayerAutoScanTest):
@@ -139,10 +140,7 @@ class TrtConvertYoloBoxTest(TrtLayerAutoScanTest):
             self.dynamic_shape.opt_input_shape = {}
 
         def generate_trt_nodes_num(attrs, dynamic_shape):
-            if dynamic_shape == True:
-                return 0, 5
-            else:
-                return 1, 4
+            return 1, 4
 
         attrs = [
             program_config.ops[i].attrs for i in range(len(program_config.ops))
@@ -166,7 +164,26 @@ class TrtConvertYoloBoxTest(TrtLayerAutoScanTest):
                 attrs, True), 1e-3
 
     def add_skip_trt_case(self):
-        pass
+
+        def teller1(program_config, predictor_config):
+            if len(
+                    self.dynamic_shape.min_input_shape
+            ) != 0 and self.trt_param.precision == paddle_infer.PrecisionType.Half:
+                return True
+            return False
+
+        self.add_skip_case(
+            teller1, SkipReasons.TRT_NOT_IMPLEMENTED,
+            "The output has diff between gpu and trt in dynamic fp16 mode.")
+
+        def teller2(program_config, predictor_config):
+            if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt':
+                return True
+            return False
+
+        self.add_skip_case(
+            teller2, SkipReasons.TRT_NOT_SUPPORT,
+            "The output has diff between gpu and trt in Windows.")
 
     def test(self):
         self.add_skip_trt_case()
-- 
GitLab