From 17a2003d69cfa82ff7058ab3d03e95e05d3cb2c2 Mon Sep 17 00:00:00 2001
From: zhoutianzi666 <39978853+zhoutianzi666@users.noreply.github.com>
Date: Tue, 28 Jun 2022 11:11:11 +0800
Subject: [PATCH] [Inference TRT] elementwise layer support (#43851)

* elementwise support

* commit
---
 .../tensorrt/convert/elementwise_op.cc        | 313 ++++++------------
 .../inference/tensorrt/convert/op_converter.h | 302 ++++++++++++++---
 paddle/fluid/inference/tensorrt/engine.h      | 105 ++++--
 3 files changed, 427 insertions(+), 293 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 8fd0e1bbd06..2d342a6f704 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -19,236 +19,115 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-static bool CheckDims(const nvinfer1::Dims& dims_x,
-                      const nvinfer1::Dims& dims_y) {
-  if (dims_x.nbDims != dims_y.nbDims) {
-    return false;
-  }
-  for (int i = 0; i < dims_x.nbDims; i++) {
-    if (dims_x.d[i] != dims_y.d[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-
-class ElementwiseWeightOpConverter : public OpConverter {
+class ElementwiseTensorOpConverter : public OpConverter {
  public:
-  ElementwiseWeightOpConverter() {}
+  ElementwiseTensorOpConverter() {}
   void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    // Here the two nullptr looks strange, that's because the
-    // framework::OpDesc's constructor is strange.
-    nvinfer1::ILayer* layer = nullptr;
+                  const framework::Scope& scope,
+                  bool test_mode) override {
+    VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
     framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
     auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::ITensor* Y = nullptr;
     auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
-    PADDLE_ENFORCE_NOT_NULL(
-        Y_v, platform::errors::NotFound("Variable %s not found in scope.",
-                                        op_desc.Input("Y").front().c_str()));
-    auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    float* weight_data = nullptr;
-    auto output_name = op_desc.Output("Out")[0];
-    weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t);
-    nvinfer1::Dims dims_x = X->getDimensions();
-
-    auto regist_eltwise_weight = [&](nvinfer1::ScaleMode scale_mode) {
-      TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
-                                           static_cast<void*>(weight_data),
-                                           static_cast<size_t>(Y_t->numel())};
-      TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                           0};
-      TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
-                                           0};
-
-      nvinfer1::IShuffleLayer* expand_layer = nullptr;
-      nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
-      int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0;
-      auto input_dim = X->getDimensions();
-      if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-        nvinfer1::Dims expand_shape;
-        expand_shape.nbDims = 3 + dynamic_shape_offset;
-        for (int i = 0; i < expand_shape.nbDims; i++) {
-          if (i < input_dim.nbDims) {
-            expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
-          } else {
-            expand_shape.d[i] = 1;
-          }
-        }
-        expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
-        expand_layer->setReshapeDimensions(expand_shape);
-        X = expand_layer->getOutput(0);
-        expand_layer->getOutput(0)->setName(
-            ("elementwise_reshape_out: " + output_name).c_str());
-        expand_layer->setName(
-            ("Elewise: Shuffle: (Output: " + output_name + ")").c_str());
-      }
-      if (op_type_ == "add") {
-        nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, ScaleNd, *X, scale_mode, shift_weights.get(),
-            scale_weights.get(), power_weights.get(), dynamic_shape_offset);
-        layer = scale_layer;
-      } else if (op_type_ == "mul") {
-        nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Scale, *X, scale_mode, scale_weights.get(),
-            shift_weights.get(), power_weights.get());
-        layer = scale_layer;
-      }
-      if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-        nvinfer1::Dims squeeze_shape;
-        squeeze_shape.nbDims = input_dim.nbDims;
-        for (int i = 0; i < squeeze_shape.nbDims; i++) {
-          squeeze_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
-        }
-        squeeze_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
-        squeeze_layer->setReshapeDimensions(squeeze_shape);
-        RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_,
-                                 {output_name}, test_mode);
-      } else {
-        RreplenishLayerAndOutput(layer, "elementwise_" + op_type_,
-                                 {output_name}, test_mode);
-      }
-    };
-
-    if (engine_->with_dynamic_shape()) {
-      if (Y_t->dims().size() == 1) {
-        auto scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-        PADDLE_ENFORCE_EQ(Y_t->dims()[0], dims_x.d[1],
-                          platform::errors::InvalidArgument(
-                              "The Bias's size(%d) should be equal to the "
-                              "first dim(%d) of the Input.",
-                              Y_t->dims()[0], dims_x.d[1]));
-        regist_eltwise_weight(scale_mode);
-      } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The size of input bias's dims is %d, but TensorRT dynamic shape "
-            "only support size = 1 for Elementwise op!",
-            Y_t->dims().size()));
+    if (Y_v) {
+      // Y is weight
+      auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
+      float* weight_data =
+          engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t);
+      std::vector<int> dims_y = phi::vectorize<int>(Y_t->dims());
+      TensorRTEngine::Weight y_weight{nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(weight_data),
+                                      static_cast<size_t>(Y_t->numel())};
+      nvinfer1::Dims trt_dims_y;
+      trt_dims_y.nbDims = dims_y.size();
+      for (int i = 0; i < trt_dims_y.nbDims; i++) {
+        trt_dims_y.d[i] = dims_y[i];
       }
-      return;
+      Y = TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_y, y_weight.get())
+              ->getOutput(0);
+    } else {
+      Y = engine_->GetITensor(op_desc.Input("Y").front());
     }
-    std::vector<int> no_batch_dims;
-    int start_index = 0;
-
-    for (; start_index < dims_x.nbDims; start_index++)
-      no_batch_dims.push_back(dims_x.d[start_index]);
-    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+    if (X->getDimensions().nbDims < Y->getDimensions().nbDims) {
+      auto* tmp = X;
+      X = Y;
+      Y = tmp;
+    }
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
     auto output_name = op_desc.Output("Out")[0];
-    std::vector<int> dims_y = phi::vectorize<int>(Y_t->dims());
-    if (dims_y.size() == no_batch_dims.size() + 1) {
-      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
+    // axis here is relative to explicit batch
+    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
+    int real_x_rank = dims_x.nbDims;
+    int real_y_rank = dims_y.nbDims;
+    if (!engine_->with_dynamic_shape()) {
+      real_x_rank++;
+      real_y_rank++;
+      if (Y_v) real_y_rank--;
+    }
+    if (axis == -1) {
+      axis = real_x_rank - real_y_rank;
+    }
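+    // In static-shape (implicit batch) mode the TRT network shape omits the
+    // batch dim, while axis above was computed against the full Paddle
+    // shape, so shift it left by one.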
+    if (!engine_->with_dynamic_shape() && axis > 0) {
+      axis--;
+    }
-    if (dims_y.size() == 1 && dims_y[0] == no_batch_dims[0]) {
-      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-    } else if (dims_y.size() == no_batch_dims.size() &&
-               dims_y[0] == no_batch_dims[0]) {
-      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      for (size_t i = 1; i < no_batch_dims.size(); i++) {
-        if (dims_y[i] != no_batch_dims[i]) {
-          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
-          break;
+    // X: - - - - - - -
+    //          axis
+    // Y:       - - -
+    // we need to expand Y's rank to match X's rank
+    int left_one_num = axis;
+    int right_one_num = dims_x.nbDims - axis - dims_y.nbDims;
+    nvinfer1::IShuffleLayer* reshape_layer;
+    nvinfer1::ITensor* reshape_y_tensor;
+    if (left_one_num > 0 || right_one_num > 0) {
+      if (engine_->with_dynamic_shape()) {
+        auto* y_shape_tensor = Shape(Y);
+        auto* new_y_shape_tensor = y_shape_tensor;
+        if (axis > 0) {
+          std::vector<int> left_one(left_one_num, 1);
+          auto* left_one_tensor = Add1DConstantLayer(left_one);
+          new_y_shape_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+              left_one_tensor, new_y_shape_tensor});
         }
-      }
-      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
-        for (size_t i = 1; i < no_batch_dims.size(); i++) {
-          if (dims_y[i] != 1)
-            PADDLE_THROW(platform::errors::InvalidArgument(
-                "The bias's %d dim is %d, but TensorRT dynamic shape only "
-                "support it equals to 1 for Elementwise op!",
-                i, dims_y[i]));
+        if (right_one_num > 0) {
+          std::vector<int> right_one(right_one_num, 1);
+          auto* right_one_tensor = Add1DConstantLayer(right_one);
+          new_y_shape_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+              new_y_shape_tensor, right_one_tensor});
         }
-      }
-    } else {
-      if (dims_y.size() >= 1) {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The size of bias's dims is %d and bias's size is %d. TensorRT "
-            "doesn't support this shape for Elementwise op!",
-            dims_y.size(), dims_y[0]));
+        reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y);
+        reshape_layer->setInput(1, *new_y_shape_tensor);
       } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The size of bias's dims is %d. TensorRT doesn't support "
-            "this shape for Elementwise op!",
-            dims_y.size()));
+        nvinfer1::Dims new_y_dims;
+        new_y_dims.nbDims = left_one_num + dims_y.nbDims + right_one_num;
+        for (int i = 0; i < new_y_dims.nbDims; i++) new_y_dims.d[i] = 1;
+        for (int i = 0; i < dims_y.nbDims; i++)
+          new_y_dims.d[left_one_num + i] = dims_y.d[i];
+        reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y);
+        reshape_layer->setReshapeDimensions(new_y_dims);
       }
+      reshape_y_tensor = reshape_layer->getOutput(0);
+    } else {
+      // In fact, we could drop this `else` and always reshape, but the
+      // rt_resnet50_test CI fails on TRT 6015 without it.
+      reshape_y_tensor = Y;
     }
-    regist_eltwise_weight(scale_mode);
-  }
-
- protected:
-  std::string op_type_;
-};
-class ElementwiseTensorOpConverter : public OpConverter {
- public:
-  ElementwiseTensorOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
     auto op_pair = ops.find(op_type_);
-    PADDLE_ENFORCE_NE(op_pair, ops.end(),
+    PADDLE_ENFORCE_NE(op_pair,
+                      ops.end(),
                       platform::errors::InvalidArgument(
                           "Elementwise op's type(%s) is not supported. Please "
                           "check if the op_type is correct.",
                           op_type_));
-    // Here the two nullptr looks strange, that's because the
-    // framework::OpDesc's constructor is strange.
-    framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
-
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
-    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
-    std::vector<nvinfer1::ITensor*> itensors;
-    itensors.push_back(X);
-    itensors.push_back(Y);
-    nvinfer1::Dims dims_x = X->getDimensions();
-    nvinfer1::Dims dims_y = Y->getDimensions();
-
-    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
-    auto output_name = op_desc.Output("Out")[0];
-
-    auto common_func = [&](nvinfer1::ILayer* layer) {
-      RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
-    };
-
-    if (dims_x.nbDims == dims_y.nbDims) {
-      // The two input tensor should have the same dims
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-      nvinfer1::IElementWiseLayer* elet_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *X, *Y, op_pair->second);
-
-      layer = elet_layer;
-    } else {
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
-                 "ElementWisePluginLayer";
-      if (engine_->with_dynamic_shape()) {
-#if IS_TRT_VERSION_GE(6000)
-        plugin::ElementwisePluginDynamic* plugin =
-            new plugin::ElementwisePluginDynamic(op_type_, axis);
-        layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin);
-#else
-        PADDLE_THROW(platform::errors::Fatal(
-            "You are running the TRT Dynamic Shape mode, need to confirm that "
-            "your TRT version is no less than 6.0"));
-#endif
-      } else {
-        plugin::ElementWisePlugin* plugin =
-            new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
-
-        std::vector<nvinfer1::ITensor*> inputs{X, Y};
-        auto* plugin_layer = engine_->AddPlugin(
-            inputs.data(), inputs.size(),
-            reinterpret_cast<plugin::PluginTensorRT*>(plugin));
-
-        layer = plugin_layer;
-      }
-    }
-    common_func(layer);
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, ElementWise, *X, *reshape_y_tensor, op_pair->second);
     RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
   }
 
 protected:
@@ -268,16 +147,6 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
     {"max", nvinfer1::ElementWiseOperation::kMAX},
 };
 
-class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
-};
-
-class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
-};
-
 class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
  public:
   ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
@@ -318,9 +187,15 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
 }  // namespace paddle
 
 REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
-                          ElementwiseWeightAddOpConverter);
+                          ElementwiseTensorAddOpConverter);
 REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
-                          ElementwiseWeightMulOpConverter);
+                          ElementwiseTensorMulOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_sub_weight,
+                          ElementwiseTensorSubOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_div_weight,
+                          ElementwiseTensorDivOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_pow_weight,
+                          ElementwiseTensorPowOpConverter);
 
 REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
                           ElementwiseTensorAddOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index f7eb7f859af..d179e8bb34c 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
@@ -46,14 +47,16 @@ class OpConverter {
   // test_mode: whether the instance executes in an unit test.
   void ConvertOp(const framework::proto::OpDesc& op,
                  const std::unordered_set<std::string>& parameters,
-                 const framework::Scope& scope, TensorRTEngine* engine,
+                 const framework::Scope& scope,
+                 TensorRTEngine* engine,
                  bool test_mode = false) {
     framework::OpDesc op_desc(op, nullptr);
 
     OpConverter* it{nullptr};
 
     if (op_desc.Type() == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(),
+                        1UL,
                         platform::errors::InvalidArgument(
                             "The input op mul's Input(\"Y\")."
                             "size() should equal to 1, but reveceid "
@@ -67,11 +70,10 @@ class OpConverter {
     if (op_desc.Type().find("elementwise") != std::string::npos) {
       static std::unordered_set<std::string> add_tensor_op_set{
           "add", "mul", "sub", "div", "max", "min", "pow"};
-      // TODO(xingzhaolong): all mul, sub, div
-      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
-      // "sub", "div"};
-      static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
+      static std::unordered_set<std::string> add_weight_op_set{
+          "add", "mul", "sub", "div", "pow"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(),
+                        1UL,
                         platform::errors::InvalidArgument(
                             "The input op's Input(\"Y\")."
                             "size() should equal to 1, but reveceid "
@@ -82,64 +84,74 @@ class OpConverter {
       std::string Y = op_desc.Input("Y")[0];
       if (parameters.count(Y)) {
         PADDLE_ENFORCE_GT(
-            add_weight_op_set.count(op_type), 0,
+            add_weight_op_set.count(op_type),
+            0,
             platform::errors::Unimplemented("Unsupported elementwise type %s",
                                             op_type.c_str()));
         it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                     "_weight");
         PADDLE_ENFORCE_NOT_NULL(
-            it, platform::errors::Unimplemented(
-                    "no OpConverter for optype [%s]", op_desc.Type()));
+            it,
+            platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                            op_desc.Type()));
       } else {
         PADDLE_ENFORCE_GT(
-            add_tensor_op_set.count(op_type), 0,
+            add_tensor_op_set.count(op_type),
+            0,
             platform::errors::Unimplemented("Unsupported elementwise type %s",
                                             op_type.c_str()));
         it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                     "_tensor");
       }
       PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
     }
 
     if (op_desc.Type() == "depthwise_conv2d") {
       it = Registry<OpConverter>::Global().Lookup("conv2d");
       PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
     }
     if (op_desc.Type() == "depthwise_conv2d_transpose") {
       it = Registry<OpConverter>::Global().Lookup("conv2d_transpose");
       PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
    }
    if (op_desc.Type() == "transpose2") {
      it = Registry<OpConverter>::Global().Lookup("transpose");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
    }
    if (op_desc.Type() == "flatten2") {
      it = Registry<OpConverter>::Global().Lookup("flatten");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
    }
    // reshape2 == reshape
    if (op_desc.Type() == "reshape2") {
      it = Registry<OpConverter>::Global().Lookup("reshape");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                              op_desc.Type()));
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                          op_desc.Type()));
    }
    if (!it) {
      it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
    }
    PADDLE_ENFORCE_NOT_NULL(
-        it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
-                                            op_desc.Type()));
+        it,
+        platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                        op_desc.Type()));
 
    it->SetEngine(engine);
    (*it)(op, scope, test_mode);
@@ -215,7 +227,8 @@ class OpConverter {
   // the INetwork's inputs and outputs should specified in some other modules.
   void ConvertBlock(const framework::proto::BlockDesc& block,
                     const std::unordered_set<std::string>& parameters,
-                    const framework::Scope& scope, TensorRTEngine* engine) {
+                    const framework::Scope& scope,
+                    TensorRTEngine* engine) {
     std::unique_lock<std::mutex> lk(mut_);
     for (int i = 0; i < block.ops_size(); i++) {
       const auto& op = block.ops(i);
@@ -225,20 +238,24 @@ class OpConverter {
   // The scope here should be inited with the parameter vars.
   void ConvertBlockToTRTEngine(
-      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      framework::BlockDesc* block_desc,
+      const framework::Scope& scope,
       const std::vector<std::string>& inputs,
       const std::unordered_set<std::string>& parameters,
-      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+      const std::vector<std::string>& outputs,
+      TensorRTEngine* engine) {
    engine->InitNetwork();
    bool all_dynamic_shape_set = true;
    for (auto& input : inputs) {
      if (parameters.count(input)) continue;
      auto* var = block_desc->FindVar(input);
      PADDLE_ENFORCE_NOT_NULL(
-          var, platform::errors::NotFound("no variable called %s in block.",
-                                          input.c_str()));
+          var,
+          platform::errors::NotFound("no variable called %s in block.",
+                                     input.c_str()));
      PADDLE_ENFORCE_EQ(
-          var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+          var->GetType(),
+          FluidDT::VarType_Type_LOD_TENSOR,
          platform::errors::InvalidArgument("TensorRT engine only takes "
                                            "LoDTensor as input"));
      auto var_shape = var->GetShape();
@@ -263,25 +280,29 @@ class OpConverter {
        } else {
          input_shape.push_back(min_input_shape[i]);
          // the i dimension should be same.
-          PADDLE_ENFORCE_EQ(min_input_shape[i], optim_input_shape[i],
+          PADDLE_ENFORCE_EQ(min_input_shape[i],
+                            optim_input_shape[i],
                            platform::errors::InvalidArgument(
                                "The dim (%d) of the min_input_shape and "
                                "optim_input_shape should be same."));
        }
      }
      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          input,
+          FluidDataType2TRT(
+              var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(input_shape, input, true));
 #endif
    } else {
      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          input,
+          FluidDataType2TRT(
+              var->Proto()->type().lod_tensor().tensor().data_type()),
          Vec2TRT_Dims(var_shape, input));
    }
  }
-    PADDLE_ENFORCE_EQ(all_dynamic_shape_set, true,
+    PADDLE_ENFORCE_EQ(all_dynamic_shape_set,
+                      true,
                      platform::errors::InvalidArgument(
                          "some trt inputs dynamic shape info not set, "
                          "check the INFO log above for more details."));
@@ -294,20 +315,221 @@ class OpConverter {
    engine->ClearWeights();
  }
 
+  // rank(result) = rank(input)
+  nvinfer1::ITensor* Gather(nvinfer1::ITensor* input,
+                            const std::vector<int32_t> indices,
+                            int axis = 0) {
+    auto* indices_tensor = Add1DConstantLayer(indices, " ");
+    auto* result =
+        TRT_ENGINE_ADD_LAYER(engine_, Gather, *input, *indices_tensor, axis)
+            ->getOutput(0);
+    return result;
+  }
+
+  // paddle allows negative index
+  // for axis length = 5, paddle allows [-5, 4]
+  nvinfer1::ITensor* FixNegIndices(nvinfer1::ITensor* input_shape,
+                                   nvinfer1::ITensor* indices) {
+    int rank = input_shape->getDimensions().nbDims;
+    std::vector<int32_t> zero = std::vector<int32_t>(rank, 0);
+    std::vector<int32_t> minus_one = std::vector<int32_t>(rank, -1);
+    nvinfer1::ITensor* zero_tensor = Add1DConstantLayer(zero);
+    nvinfer1::ITensor* minus_one_tensor = Add1DConstantLayer(minus_one);
+    // sign is -1 for negative indices and 0 otherwise
+    auto* sign = Max(Min(indices, zero_tensor), minus_one_tensor);
+    return Sub(indices, Prod(sign, input_shape));
+  }
+
+  nvinfer1::ITensor* Shape(nvinfer1::ITensor* input) {
+    return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0);
+  }
+
+  // Concat does not change the rank
+  nvinfer1::ITensor* Concat(const std::vector<nvinfer1::ITensor*>& inputs,
+                            int axis = 0) {
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Concatenation, inputs.data(), inputs.size());
+    if (axis != 0) layer->setAxis(axis);
+    nvinfer1::ITensor* c = layer->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Sum(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kSUM)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Prod(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kPROD)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Min(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kMIN)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Max(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kMAX)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Sub(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kSUB)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Div(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kDIV)
+            ->getOutput(0);
+    return c;
+  }
+
+  nvinfer1::ITensor* Act(nvinfer1::ITensor* a,
+                         nvinfer1::ActivationType act_type) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(engine_, Activation, *a, act_type)->getOutput(0);
+    return c;
+  }
+
+  // Get element tensor of 1D shape tensor
+  nvinfer1::ITensor* GetEleTensorOfShape(nvinfer1::ITensor* shape_tensor,
+                                         int index,
+                                         bool is_scalar = false) {
+    auto* tensor =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Gather,
+                             *shape_tensor,
+                             *Add1DConstantLayer(index, " ", is_scalar),
+                             0)
+            ->getOutput(0);
+    return tensor;
+  }
+
+  // Create and add Multi-D constant float layer
+  nvinfer1::ITensor* AddConstantLayer(const float* data,
+                                      const std::vector<int>& weight_dims,
+                                      const std::string& weight_name) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = std::accumulate(
+        weight_dims.begin(), weight_dims.end(), 1, std::multiplies<int>());
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
+    nvinfer1::Dims trt_dims;
+    trt_dims.nbDims = weight_dims.size();
+    for (size_t i = 0; i < weight_dims.size(); i++)
+      trt_dims.d[i] = weight_dims[i];
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims, weight.get());
+    return const_layer->getOutput(0);
+  }
+
+  // Create and add 1D constant float layer
+  nvinfer1::ITensor* Add1DConstantLayer(const std::vector<float>& data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = data.size();
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
+    nvinfer1::Dims input_shape;
+    input_shape.nbDims = scalar ? 0 : 1;
+    input_shape.d[0] = data_size;
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
+    return const_layer->getOutput(0);
+  }
+
+  // Create and add 1D constant layer
+  nvinfer1::ITensor* Add1DConstantLayer(const std::vector<int>& data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = data.size();
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
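+    // A scalar constant is emitted as a 0-D tensor; otherwise a 1-D tensor
+    // of length data_size (d[0] is ignored when nbDims == 0).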
+    nvinfer1::Dims input_shape;
+    input_shape.nbDims = scalar ? 0 : 1;
+    input_shape.d[0] = data_size;
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
+    return const_layer->getOutput(0);
+  }
+
+  nvinfer1::ITensor* Add1DConstantLayer(nvinfer1::Dims data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::vector<int> tmp_data;
+    for (int i = 0; i < data.nbDims; i++) tmp_data.push_back(data.d[i]);
+    return Add1DConstantLayer(tmp_data, weight_name, scalar);
+  }
+
+  nvinfer1::ITensor* Add1DConstantLayer(int32_t data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::vector<int> tmp_data;
+    tmp_data.push_back(data);
+    return Add1DConstantLayer(tmp_data, weight_name, scalar);
+  }
+
  void RreplenishLayerAndOutput(
-      nvinfer1::ILayer* layer, const std::string& layer_type,
+      nvinfer1::ILayer* layer,
+      const std::string& layer_type,
      const std::vector<std::string>& output_tensor_names,
      bool test_mode = false) {
    size_t num_out = output_tensor_names.size();
+    std::string layer_name = layer_type + " (Output: ";
    for (size_t i = 0; i < num_out; i++) {
      layer->getOutput(i)->setName(output_tensor_names[i].c_str());
      engine_->SetITensor(output_tensor_names[i], layer->getOutput(i));
      if (test_mode) {
        engine_->DeclareOutput(output_tensor_names[i]);
      }
+      layer_name += output_tensor_names[i];
+      if (i != num_out - 1) layer_name += ", ";
    }
-    layer->setName(
-        (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str());
+    layer->setName((layer_name + ")").c_str());
  }
 
  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index f781cd0cb3a..b4a0478925b 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -66,13 +66,16 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
 
 // The T can be int32 or int64 type.
 template <typename T>
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape,
+                            std::string input,
                             bool with_dynamic_shape = false) {
-  PADDLE_ENFORCE_GT(shape.size(), 0UL,
+  PADDLE_ENFORCE_GT(shape.size(),
+                    0UL,
                     platform::errors::InvalidArgument(
                         "TensorRT's tensor input requires at least 1 "
                         "dimensions, but input %s has %d dims.",
-                        input, shape.size()));
+                        input,
+                        shape.size()));
 
   auto ShapeStr = [](const std::vector<T>& shape) {
     std::ostringstream os;
@@ -93,7 +96,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
       PADDLE_THROW(platform::errors::InvalidArgument(
           "The input [%s] shape of trt subgraph is %s, please enable "
           "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-          input, ShapeStr(shape)));
+          input,
+          ShapeStr(shape)));
     }
     return nvinfer1::Dims3(shape[1], shape[2], shape[3]);
   } else if (shape.size() == 5UL) {
@@ -101,7 +105,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
       PADDLE_THROW(platform::errors::InvalidArgument(
           "The input [%s] shape of trt subgraph is %s, please enable "
           "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-          input, ShapeStr(shape)));
+          input,
+          ShapeStr(shape)));
     }
     return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]);
   } else if (shape.size() == 3UL) {
@@ -109,7 +114,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
       PADDLE_THROW(platform::errors::InvalidArgument(
          "The input [%s] shape of trt subgraph is %s, please enable "
          "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-          input, ShapeStr(shape)));
+          input,
+          ShapeStr(shape)));
     }
     return nvinfer1::Dims2(shape[1], shape[2]);
   } else if (shape.size() == 2UL) {
@@ -117,7 +123,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
       PADDLE_THROW(platform::errors::InvalidArgument(
          "The input [%s] shape of trt subgraph is %s, please enable "
          "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-          input, ShapeStr(shape)));
+          input,
+          ShapeStr(shape)));
     }
     nvinfer1::Dims dims;
     dims.nbDims = 1;
     dims.d[0] = shape[1];
     return dims;
   }
@@ -125,11 +132,13 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
   // static shape doesn't support 1D op so far.
-  PADDLE_ENFORCE_NE(shape.size(), 1UL,
+  PADDLE_ENFORCE_NE(shape.size(),
+                    1UL,
                     platform::errors::InvalidArgument(
                         "The input [%s] shape of trt subgraph is %s."
"it's not supported by trt so far", - input, ShapeStr(shape))); + input, + ShapeStr(shape))); nvinfer1::Dims dims; dims.nbDims = shape.size() - 1; @@ -151,7 +160,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, return dims; } } -} // NOLINT +} // namespace class TRTInt8Calibrator; @@ -184,9 +193,11 @@ class TensorRTEngine { }; TensorRTEngine( - int max_batch, int max_workspace, + int max_batch, + int max_workspace, AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32, - TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, + TRTInt8Calibrator* calibrator = nullptr, + int device_id = 0, const ShapeMapType min_input_shape = {}, const ShapeMapType max_input_shape = {}, const ShapeMapType optim_input_shape = {}, @@ -205,17 +216,21 @@ class TensorRTEngine { if (min_input_shape_.size() != 0 && max_input_shape_.size() != 0 && optim_input_shape_.size() != 0) { PADDLE_ENFORCE_EQ( - min_input_shape_.size(), max_input_shape_.size(), + min_input_shape_.size(), + max_input_shape_.size(), platform::errors::InvalidArgument( "The min_input_shape_'s size(%d) should be equal to the " "size(%d) of max_input_shape_", - min_input_shape_.size(), max_input_shape_.size())); + min_input_shape_.size(), + max_input_shape_.size())); PADDLE_ENFORCE_EQ( - min_input_shape_.size(), optim_input_shape_.size(), + min_input_shape_.size(), + optim_input_shape_.size(), platform::errors::InvalidArgument( "The min_input_shape_'s size(%d) should be equal to the " "size(%d) of optim_input_shape_", - min_input_shape_.size(), optim_input_shape_.size())); + min_input_shape_.size(), + optim_input_shape_.size())); #if IS_TRT_VERSION_GE(6000) with_dynamic_shape_ = true; #else @@ -242,7 +257,8 @@ class TensorRTEngine { const nvinfer1::Dims& dim); // Set the offset-th output from a layer as the network's output, and set its // name. - void DeclareOutput(const nvinfer1::ILayer* layer, int offset, + void DeclareOutput(const nvinfer1::ILayer* layer, + int offset, const std::string& name); // Set the itensor_map_[name] as the network's output, and set its name. void DeclareOutput(const std::string& name); @@ -374,7 +390,8 @@ class TensorRTEngine { int GetDeviceId() { return device_id_; } nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs, - int num_inputs, plugin::PluginTensorRT*); + int num_inputs, + plugin::PluginTensorRT*); nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs, int num_inputs, @@ -431,7 +448,8 @@ class TensorRTEngine { // After finishing adding ops, freeze this network and creates the execution // environment. 
   void FreezeNetwork();
 
-  void Execute(int batch_size, std::vector<void*>* buffers,
+  void Execute(int batch_size,
+               std::vector<void*>* buffers,
                cudaStream_t stream = nullptr);
 
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
@@ -448,15 +466,20 @@ class TensorRTEngine {
       auto name = it.first;
       auto input_shape = it.second;
       PADDLE_ENFORCE_EQ(
-          min_input_shape_.count(name), true,
+          min_input_shape_.count(name),
+          true,
          platform::errors::InvalidArgument(
              "TRT dynamic_shape min_input_shape %s not found.", name));
-      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), input_shape.size(),
+      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(),
+                        input_shape.size(),
                        platform::errors::InvalidArgument(
                            "TRT dynamic_shape min_input_shape %s size not "
                            "equal, the min_input_shape[%s].size()=%d"
                            ", but the runtime_input_shape[%s].size()=%d.",
-                            name, name, min_input_shape_[name].size(), name,
+                            name,
+                            name,
+                            min_input_shape_[name].size(),
+                            name,
                            input_shape.size()));
       auto bak_min_shape = min_input_shape_[name];
       auto bak_max_shape = max_input_shape_[name];
@@ -497,7 +520,8 @@ class TensorRTEngine {
 
 #if IS_TRT_VERSION_GE(6000)
   nvinfer1::IPluginV2Layer* AddDynamicPlugin(
-      nvinfer1::ITensor* const* inputs, int num_inputs,
+      nvinfer1::ITensor* const* inputs,
+      int num_inputs,
       plugin::DynamicPluginTensorRT* plugin) {
     owned_pluginv2_.emplace_back(plugin);
     return network()->addPluginV2(inputs, num_inputs, *plugin);
@@ -524,7 +548,8 @@ class TensorRTEngine {
   void Set(const std::string& attr_name, AttrType* attr) {
     if (attrs_.count(attr_name) == 0) {
       PADDLE_ENFORCE_EQ(
-          attrs_.count(attr_name), 0,
+          attrs_.count(attr_name),
+          0,
          platform::errors::AlreadyExists(
              "Attribute %s already set in trt engine.", attr_name));
     } else {
@@ -543,7 +568,8 @@ class TensorRTEngine {
   template <typename AttrType>
   void SetNotOwned(const std::string& attr_name, AttrType* attr) {
     PADDLE_ENFORCE_EQ(
-        attrs_.count(attr_name), 0,
+        attrs_.count(attr_name),
+        0,
        platform::errors::AlreadyExists(
            "Attribute %s already set in trt engine.", attr_name));
     attrs_[attr_name] = attr;
@@ -552,7 +578,8 @@ class TensorRTEngine {
   // Get a reference to the attributed previously set.
   template <typename AttrType>
   AttrType& Get(const std::string& attr_name) const {
-    PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(),
+    PADDLE_ENFORCE_NE(attrs_.find(attr_name),
+                      attrs_.end(),
                      platform::errors::InvalidArgument(
                          "Attribute %s not found in trt engine.", attr_name));
     try {
@@ -574,7 +601,8 @@ class TensorRTEngine {
     };
 
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "Invalid type for attritube %s, expected: %s, actual: %s.", attr_name,
+        "Invalid type for attribute %s, expected: %s, actual: %s.",
+        attr_name,
        TypeToString(typeid(AttrType*)),
        TypeToString(attrs_.at(attr_name).type())));
   }
@@ -672,7 +700,7 @@ class TensorRTEngine {
 // them, and an macro like this is more extensible when underlying TensorRT
 // library add new layer supports.
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
-  engine__->network()->add##layer__(__VA_ARGS__);
+  engine__->network()->add##layer__(__VA_ARGS__)
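+// Note: no trailing semicolon, so the macro can be used inside a larger
+// expression, e.g. TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X)->getOutput(0).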
 
 class TRTEngineManager {
  public:
@@ -687,18 +715,27 @@ class TRTEngineManager {
   }
 
   TensorRTEngine* Create(
-      std::string name, int max_batch, int max_workspace,
+      std::string name,
+      int max_batch,
+      int max_workspace,
       AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
-      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      TRTInt8Calibrator* calibrator = nullptr,
+      int device_id = 0,
       const std::map<std::string, std::vector<int>> min_input_shape = {},
       const std::map<std::string, std::vector<int>> max_input_shape = {},
       const std::map<std::string, std::vector<int>> optim_input_shape = {},
       bool disable_trt_plugin_fp16 = false,
       nvinfer1::ILogger& logger = NaiveLogger::Global()) {
-    auto* p =
-        new TensorRTEngine(max_batch, max_workspace, precision, calibrator,
-                           device_id, min_input_shape, max_input_shape,
-                           optim_input_shape, disable_trt_plugin_fp16, logger);
+    auto* p = new TensorRTEngine(max_batch,
+                                 max_workspace,
+                                 precision,
+                                 calibrator,
+                                 device_id,
+                                 min_input_shape,
+                                 max_input_shape,
+                                 optim_input_shape,
+                                 disable_trt_plugin_fp16,
+                                 logger);
     engines_[name].reset(p);
     return p;
   }
-- 
GitLab
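
For reference, a minimal standalone sketch (not part of the patch; PadYDims
and the sample shapes are hypothetical, not Paddle APIs) of the rank-padding
rule the new converter applies to Y before adding the IElementWiseLayer:

    #include <cassert>
    #include <iostream>
    #include <vector>

    // Pad y_dims with 1s so its rank equals x_rank, placing the original
    // dims at offset `axis` -- the same shape the converter feeds to the
    // IShuffleLayer in its static-shape branch.
    std::vector<int> PadYDims(int x_rank, std::vector<int> y_dims, int axis) {
      if (axis == -1) axis = x_rank - static_cast<int>(y_dims.size());
      const int left = axis;
      const int right = x_rank - axis - static_cast<int>(y_dims.size());
      assert(left >= 0 && right >= 0);
      std::vector<int> out(x_rank, 1);
      for (size_t i = 0; i < y_dims.size(); ++i) out[left + i] = y_dims[i];
      return out;
    }

    int main() {
      // X: [8, 64, 32, 32], Y: [64], axis = 1 -> Y reshaped to [1, 64, 1, 1]
      for (int d : PadYDims(4, {64}, 1)) std::cout << d << ' ';
      std::cout << '\n';
      return 0;
    }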