[Inference TRT] elementwise layer support (#43851)

* elementwise support * commit

[Inference TRT] elementwise layer support (#43851)
* elementwise support * commit
17a2003d · zhoutianzi666 · GitHub · ff70a269 · 17a2003d · 17a2003d
3 changed files
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -19,236 +19,115 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
-static bool CheckDims(const nvinfer1::Dims& dims_x,
+class ElementwiseTensorOpConverter : public OpConverter {
-                      const nvinfer1::Dims& dims_y) {
-  if (dims_x.nbDims != dims_y.nbDims) {
-    return false;
-  }
-  for (int i = 0; i < dims_x.nbDims; i++) {
-    if (dims_x.d[i] != dims_y.d[i]) {
-      return false;
-    }
-  }
-  return true;
-}
-class ElementwiseWeightOpConverter : public OpConverter {
 public:
-  ElementwiseWeightOpConverter() {}
+  ElementwiseTensorOpConverter() {}
  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
+                  const framework::Scope& scope,
-    // Here the two nullptr looks strange, that's because the
+                  bool test_mode) override {
-    // framework::OpDesc's constructor is strange.
+    VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-    nvinfer1::ILayer* layer = nullptr;
    framework::OpDesc op_desc(op, nullptr);
-    VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer";
    auto* X = engine_->GetITensor(op_desc.Input("X").front());
+    nvinfer1::ITensor* Y = nullptr;
    auto* Y_v = scope.FindVar(op_desc.Input("Y").front());
-    PADDLE_ENFORCE_NOT_NULL(
+    if (Y_v) {
-        Y_v, platform::errors::NotFound("Variable %s not found in scope.",
+      // Y is a weight: its variable was found in the scope.
-                                        op_desc.Input("Y").front().c_str()));
      auto* Y_t = Y_v->GetMutable<framework::LoDTensor>();
-    float* weight_data = nullptr;
+      float* weight_data =
-    auto output_name = op_desc.Output("Out")[0];
+          engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t);
-    weight_data = engine_->GetWeightCPUData(op_desc.Input("Y").front(), Y_t);
+      std::vector<int> dims_y = phi::vectorize<int>(Y_t->dims());
-    nvinfer1::Dims dims_x = X->getDimensions();
+      TensorRTEngine::Weight y_weight{nvinfer1::DataType::kFLOAT,
-    auto regist_eltwise_weight = [&](nvinfer1::ScaleMode scale_mode) {
-      TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT,
                                      static_cast<void*>(weight_data),
                                      static_cast<size_t>(Y_t->numel())};
-      TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, nullptr,
+      nvinfer1::Dims trt_dims_y;
-                                           0};
+      trt_dims_y.nbDims = dims_y.size();
-      TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr,
+      for (int i = 0; i < trt_dims_y.nbDims; i++) {
-                                           0};
+        trt_dims_y.d[i] = dims_y[i];
-      nvinfer1::IShuffleLayer* expand_layer = nullptr;
-      nvinfer1::IShuffleLayer* squeeze_layer = nullptr;
-      int dynamic_shape_offset = engine_->with_dynamic_shape() ? 1 : 0;
-      auto input_dim = X->getDimensions();
-      if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-        nvinfer1::Dims expand_shape;
-        expand_shape.nbDims = 3 + dynamic_shape_offset;
-        for (int i = 0; i < expand_shape.nbDims; i++) {
-          if (i < input_dim.nbDims) {
-            expand_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
-          } else {
-            expand_shape.d[i] = 1;
-          }
      }
-        expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+      Y = TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims_y, y_weight.get())
-        expand_layer->setReshapeDimensions(expand_shape);
+              ->getOutput(0);
-        X = expand_layer->getOutput(0);
-        expand_layer->getOutput(0)->setName(
-            ("elementwise_reshape_out: " + output_name).c_str());
-        expand_layer->setName(
-            ("Elewise: Shuffle: (Output: " + output_name + ")").c_str());
-      }
-      if (op_type_ == "add") {
-        nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, ScaleNd, *X, scale_mode, shift_weights.get(),
-            scale_weights.get(), power_weights.get(), dynamic_shape_offset);
-        layer = scale_layer;
-      } else if (op_type_ == "mul") {
-        nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Scale, *X, scale_mode, scale_weights.get(),
-            shift_weights.get(), power_weights.get());
-        layer = scale_layer;
-      }
-      if (input_dim.nbDims < 3 + dynamic_shape_offset) {
-        nvinfer1::Dims squeeze_shape;
-        squeeze_shape.nbDims = input_dim.nbDims;
-        for (int i = 0; i < squeeze_shape.nbDims; i++) {
-          squeeze_shape.d[i] = input_dim.d[i] < 0 ? 0 : input_dim.d[i];
-        }
-        squeeze_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0)));
-        squeeze_layer->setReshapeDimensions(squeeze_shape);
-        RreplenishLayerAndOutput(squeeze_layer, "elementwise_" + op_type_,
-                                 {output_name}, test_mode);
    } else {
-        RreplenishLayerAndOutput(layer, "elementwise_" + op_type_,
+      Y = engine_->GetITensor(op_desc.Input("Y").front());
-                                 {output_name}, test_mode);
    }
-    };
-    if (engine_->with_dynamic_shape()) {
+    if (X->getDimensions().nbDims < Y->getDimensions().nbDims) {
-      if (Y_t->dims().size() == 1) {
+      auto* tmp = X;
-        auto scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+      X = Y;
-        PADDLE_ENFORCE_EQ(Y_t->dims()[0], dims_x.d[1],
+      Y = tmp;
-                          platform::errors::InvalidArgument(
-                              "The Bias's size(%d) should be equal to the "
-                              "first dim(%d) of the Input.",
-                              Y_t->dims()[0], dims_x.d[1]));
-        regist_eltwise_weight(scale_mode);
-      } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "The size of input bias's dims is %d, but TensorRT dynamic shape "
-            "only support size = 1 for Elementwise op!",
-            Y_t->dims().size()));
-      }
-      return;
-    }
-    std::vector<int> no_batch_dims;
-    int start_index = 0;
-    for (; start_index < dims_x.nbDims; start_index++)
-      no_batch_dims.push_back(dims_x.d[start_index]);
-    auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-    std::vector<int> dims_y = phi::vectorize<int>(Y_t->dims());
-    if (dims_y.size() == no_batch_dims.size() + 1) {
-      if (dims_y[0] == 1) dims_y.erase(dims_y.begin());
    }
+    nvinfer1::Dims dims_x = X->getDimensions();
+    nvinfer1::Dims dims_y = Y->getDimensions();
+    auto output_name = op_desc.Output("Out")[0];
-    if (dims_y.size() == 1 && dims_y[0] == no_batch_dims[0]) {
+    // axis here is relative to explicit batch
-      scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
-    } else if (dims_y.size() == no_batch_dims.size() &&
+    int real_x_rank = dims_x.nbDims;
-               dims_y[0] == no_batch_dims[0]) {
+    int real_y_rank = dims_y.nbDims;
-      scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
+    if (!engine_->with_dynamic_shape()) {
-      for (size_t i = 1; i < no_batch_dims.size(); i++) {
+      real_x_rank++;
-        if (dims_y[i] != no_batch_dims[i]) {
+      real_y_rank++;
-          scale_mode = nvinfer1::ScaleMode::kCHANNEL;
+      if (Y_v) real_y_rank--;
-          break;
+    }
-        }
+    if (axis == -1) {
-      }
+      axis = real_x_rank - real_y_rank;
-      if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
+    }
-        for (size_t i = 1; i < no_batch_dims.size(); i++) {
+    if (!engine_->with_dynamic_shape() && axis > 0) {
-          if (dims_y[i] != 1)
+      axis--;
-            PADDLE_THROW(platform::errors::InvalidArgument(
+    }
-                "The bias's %d dim is %d, but TensorRT dynamic shape only "
-                "support it equals to 1 for Elementwise op!",
+    // X: - -  -    - - - -
-                i, dims_y[i]));
+    //        axis
-        }
+    // Y:      -    - -
-      }
+    // We need to expand Y so that its rank equals X's rank.
+    int left_one_num = axis;
+    int right_one_num = dims_x.nbDims - axis - dims_y.nbDims;
+    nvinfer1::IShuffleLayer* reshape_layer;
+    nvinfer1::ITensor* reshape_y_tensor;
+    if (left_one_num > 0 || right_one_num > 0) {
+      if (engine_->with_dynamic_shape()) {
+        auto* y_shape_tensor = Shape(Y);
+        auto* new_y_shape_tensor = y_shape_tensor;
+        if (axis > 0) {
+          std::vector<int32_t> left_one(left_one_num, 1);
+          auto* left_one_tensor = Add1DConstantLayer(left_one);
+          new_y_shape_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+              left_one_tensor, new_y_shape_tensor});
+        }
+        if (right_one_num > 0) {
+          std::vector<int32_t> right_one(right_one_num, 1);
+          auto* right_one_tensor = Add1DConstantLayer(right_one);
+          new_y_shape_tensor = Concat(std::vector<nvinfer1::ITensor*>{
+              new_y_shape_tensor, right_one_tensor});
+        }
+        reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y);
+        reshape_layer->setInput(1, *new_y_shape_tensor);
      } else {
-      if (dims_y.size() >= 1) {
+        nvinfer1::Dims new_y_dims;
-        PADDLE_THROW(platform::errors::InvalidArgument(
+        new_y_dims.nbDims = left_one_num + dims_y.nbDims + right_one_num;
-            "The size of bias's dims is %d and bias's size is %d. TensorRT "
+        for (int i = 0; i < new_y_dims.nbDims; i++) new_y_dims.d[i] = 1;
-            "doesn't support this shape for Elementwise op!",
+        for (int i = 0; i < dims_y.nbDims; i++)
-            dims_y.size(), dims_y[0]));
+          new_y_dims.d[left_one_num + i] = dims_y.d[i];
+        reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *Y);
+        reshape_layer->setReshapeDimensions(new_y_dims);
+      }
+      reshape_y_tensor = reshape_layer->getOutput(0);
    } else {
-        PADDLE_THROW(platform::errors::InvalidArgument(
+      // In fact, we can remove this `else`, but then rt_resnet50_test CI in
-            "The size of bias's dims is %d. TensorRT doesn't support "
+      // trt 6015 fails, how ridiculous!
-            "this shape for Elementwise op!",
+      reshape_y_tensor = Y;
-            dims_y.size()));
-      }
-    }
-    regist_eltwise_weight(scale_mode);
    }
- protected:
-  std::string op_type_;
-};
-class ElementwiseTensorOpConverter : public OpConverter {
- public:
-  ElementwiseTensorOpConverter() {}
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
    auto op_pair = ops.find(op_type_);
-    PADDLE_ENFORCE_NE(op_pair, ops.end(),
+    PADDLE_ENFORCE_NE(op_pair,
+                      ops.end(),
                      platform::errors::InvalidArgument(
                          "Elementwise op's type(%s) is not supported. Please "
                          "check if the op_type is correct.",
                          op_type_));
-    // Here the two nullptr looks strange, that's because the
+    auto* layer = TRT_ENGINE_ADD_LAYER(
-    // framework::OpDesc's constructor is strange.
+        engine_, ElementWise, *X, *reshape_y_tensor, op_pair->second);
-    framework::OpDesc op_desc(op, nullptr);
-    nvinfer1::ILayer* layer = nullptr;
-    auto* X = engine_->GetITensor(op_desc.Input("X").front());
-    auto* Y = engine_->GetITensor(op_desc.Input("Y").front());
-    std::vector<nvinfer1::ITensor*> itensors;
-    itensors.push_back(X);
-    itensors.push_back(Y);
-    nvinfer1::Dims dims_x = X->getDimensions();
-    nvinfer1::Dims dims_y = Y->getDimensions();
-    int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis"));
-    auto output_name = op_desc.Output("Out")[0];
-    auto common_func = [&](nvinfer1::ILayer* layer) {
    RreplenishLayerAndOutput(layer, "elementwise", {output_name}, test_mode);
-    };
-    if (dims_x.nbDims == dims_y.nbDims) {
-      // The two input tensor should have the same dims
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-      nvinfer1::IElementWiseLayer* elet_layer =
-          TRT_ENGINE_ADD_LAYER(engine_, ElementWise, *X, *Y, op_pair->second);
-      layer = elet_layer;
-    } else {
-      VLOG(3) << "Convert a fluid elementwise op to TensorRT "
-                 "ElementWisePluginLayer";
-      if (engine_->with_dynamic_shape()) {
-#if IS_TRT_VERSION_GE(6000)
-        plugin::ElementwisePluginDynamic* plugin =
-            new plugin::ElementwisePluginDynamic(op_type_, axis);
-        layer = engine_->AddDynamicPlugin(itensors.data(), 2, plugin);
-#else
-        PADDLE_THROW(platform::errors::Fatal(
-            "You are running the TRT Dynamic Shape mode, need to confirm that "
-            "your TRT version is no less than 6.0"));
-#endif
-      } else {
-        plugin::ElementWisePlugin* plugin =
-            new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
-        std::vector<nvinfer1::ITensor*> inputs{X, Y};
-        auto* plugin_layer = engine_->AddPlugin(
-            inputs.data(), inputs.size(),
-            reinterpret_cast<plugin::PluginTensorRT*>(plugin));
-        layer = plugin_layer;
-      }
-    }
-    common_func(layer);
  }
 protected:
@@ -268,16 +147,6 @@ const std::unordered_map<std::string, nvinfer1::ElementWiseOperation>
        {"max", nvinfer1::ElementWiseOperation::kMAX},
 };
-class ElementwiseWeightAddOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightAddOpConverter() { op_type_ = "add"; }
-};
-class ElementwiseWeightMulOpConverter : public ElementwiseWeightOpConverter {
- public:
-  ElementwiseWeightMulOpConverter() { op_type_ = "mul"; }
-};
 class ElementwiseTensorAddOpConverter : public ElementwiseTensorOpConverter {
 public:
  ElementwiseTensorAddOpConverter() { op_type_ = "add"; }
@@ -318,9 +187,15 @@ class ElementwiseTensorPowOpConverter : public ElementwiseTensorOpConverter {
 }  // namespace paddle
 REGISTER_TRT_OP_CONVERTER(elementwise_add_weight,
-                          ElementwiseWeightAddOpConverter);
+                          ElementwiseTensorAddOpConverter);
 REGISTER_TRT_OP_CONVERTER(elementwise_mul_weight,
-                          ElementwiseWeightMulOpConverter);
+                          ElementwiseTensorMulOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_sub_weight,
+                          ElementwiseTensorSubOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_div_weight,
+                          ElementwiseTensorDivOpConverter);
+REGISTER_TRT_OP_CONVERTER(elementwise_pow_weight,
+                          ElementwiseTensorPowOpConverter);
 REGISTER_TRT_OP_CONVERTER(elementwise_add_tensor,
                          ElementwiseTensorAddOpConverter);

--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
@@ -46,14 +47,16 @@ class OpConverter {
  // test_mode: whether the instance executes in an unit test.
  void ConvertOp(const framework::proto::OpDesc& op,
                 const std::unordered_set<std::string>& parameters,
-                 const framework::Scope& scope, TensorRTEngine* engine,
+                 const framework::Scope& scope,
+                 TensorRTEngine* engine,
                 bool test_mode = false) {
    framework::OpDesc op_desc(op, nullptr);
    OpConverter* it{nullptr};
    if (op_desc.Type() == "mul") {
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(),
+                        1UL,
                        platform::errors::InvalidArgument(
                            "The input op mul's Input(\"Y\")."
                            "size() should equal to 1, but reveceid "
@@ -67,11 +70,10 @@ class OpConverter {
    if (op_desc.Type().find("elementwise") != std::string::npos) {
      static std::unordered_set<std::string> add_tensor_op_set{
          "add", "mul", "sub", "div", "max", "min", "pow"};
-      // TODO(xingzhaolong): all mul, sub, div
+      static std::unordered_set<std::string> add_weight_op_set{
-      // static std::unordered_set<std::string> add_weight_op_set {"add", "mul",
+          "add", "mul", "sub", "div", "pow"};
-      // "sub", "div"};
+      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(),
-      static std::unordered_set<std::string> add_weight_op_set{"add", "mul"};
+                        1UL,
-      PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), 1UL,
                        platform::errors::InvalidArgument(
                            "The input op's Input(\"Y\")."
                            "size() should equal to 1, but reveceid "
@@ -82,63 +84,73 @@ class OpConverter {
      std::string Y = op_desc.Input("Y")[0];
      if (parameters.count(Y)) {
        PADDLE_ENFORCE_GT(
-            add_weight_op_set.count(op_type), 0,
+            add_weight_op_set.count(op_type),
+            0,
            platform::errors::Unimplemented("Unsupported elementwise type %s",
                                            op_type.c_str()));
        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                    "_weight");
        PADDLE_ENFORCE_NOT_NULL(
-            it, platform::errors::Unimplemented(
+            it,
-                    "no OpConverter for optype [%s]", op_desc.Type()));
+            platform::errors::Unimplemented("no OpConverter for optype [%s]",
+                                            op_desc.Type()));
      } else {
        PADDLE_ENFORCE_GT(
-            add_tensor_op_set.count(op_type), 0,
+            add_tensor_op_set.count(op_type),
+            0,
            platform::errors::Unimplemented("Unsupported elementwise type %s",
                                            op_type.c_str()));
        it = Registry<OpConverter>::Global().Lookup("elementwise_" + op_type +
                                                    "_tensor");
      }
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    if (op_desc.Type() == "depthwise_conv2d") {
      it = Registry<OpConverter>::Global().Lookup("conv2d");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    if (op_desc.Type() == "depthwise_conv2d_transpose") {
      it = Registry<OpConverter>::Global().Lookup("conv2d_transpose");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    if (op_desc.Type() == "transpose2") {
      it = Registry<OpConverter>::Global().Lookup("transpose");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    if (op_desc.Type() == "flatten2") {
      it = Registry<OpConverter>::Global().Lookup("flatten");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    // reshape2 == reshape
    if (op_desc.Type() == "reshape2") {
      it = Registry<OpConverter>::Global().Lookup("reshape");
      PADDLE_ENFORCE_NOT_NULL(
-          it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+          it,
+          platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                          op_desc.Type()));
    }
    if (!it) {
      it = Registry<OpConverter>::Global().Lookup(op_desc.Type());
    }
    PADDLE_ENFORCE_NOT_NULL(
-        it, platform::errors::Unimplemented("no OpConverter for optype [%s]",
+        it,
+        platform::errors::Unimplemented("no OpConverter for optype [%s]",
                                        op_desc.Type()));
    it->SetEngine(engine);
@@ -215,7 +227,8 @@ class OpConverter {
  // the INetwork's inputs and outputs should specified in some other modules.
  void ConvertBlock(const framework::proto::BlockDesc& block,
                    const std::unordered_set<std::string>& parameters,
-                    const framework::Scope& scope, TensorRTEngine* engine) {
+                    const framework::Scope& scope,
+                    TensorRTEngine* engine) {
    std::unique_lock<std::mutex> lk(mut_);
    for (int i = 0; i < block.ops_size(); i++) {
      const auto& op = block.ops(i);
@@ -225,20 +238,24 @@ class OpConverter {
  // The scope  here should be inited with the parameter vars.
  void ConvertBlockToTRTEngine(
-      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      framework::BlockDesc* block_desc,
+      const framework::Scope& scope,
      const std::vector<std::string>& inputs,
      const std::unordered_set<std::string>& parameters,
-      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+      const std::vector<std::string>& outputs,
+      TensorRTEngine* engine) {
    engine->InitNetwork();
    bool all_dynamic_shape_set = true;
    for (auto& input : inputs) {
      if (parameters.count(input)) continue;
      auto* var = block_desc->FindVar(input);
      PADDLE_ENFORCE_NOT_NULL(
-          var, platform::errors::NotFound("no variable called %s in block.",
+          var,
+          platform::errors::NotFound("no variable called %s in block.",
                                     input.c_str()));
      PADDLE_ENFORCE_EQ(
-          var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+          var->GetType(),
+          FluidDT::VarType_Type_LOD_TENSOR,
          platform::errors::InvalidArgument("TensorRT engine only takes "
                                            "LoDTensor as input"));
      auto var_shape = var->GetShape();
@@ -263,25 +280,29 @@ class OpConverter {
          } else {
            input_shape.push_back(min_input_shape[i]);
            // the i dimension should be same.
-            PADDLE_ENFORCE_EQ(min_input_shape[i], optim_input_shape[i],
+            PADDLE_ENFORCE_EQ(min_input_shape[i],
+                              optim_input_shape[i],
                              platform::errors::InvalidArgument(
                                  "The dim (%d) of the min_input_shape and "
                                  "optim_input_shape should be same."));
          }
        }
        engine->DeclareInput(
-            input, FluidDataType2TRT(
+            input,
+            FluidDataType2TRT(
                var->Proto()->type().lod_tensor().tensor().data_type()),
            Vec2TRT_Dims(input_shape, input, true));
 #endif
      } else {
        engine->DeclareInput(
-            input, FluidDataType2TRT(
+            input,
+            FluidDataType2TRT(
                var->Proto()->type().lod_tensor().tensor().data_type()),
            Vec2TRT_Dims(var_shape, input));
      }
    }
-    PADDLE_ENFORCE_EQ(all_dynamic_shape_set, true,
+    PADDLE_ENFORCE_EQ(all_dynamic_shape_set,
+                      true,
                      platform::errors::InvalidArgument(
                          "some trt inputs dynamic shape info not set, "
                          "check the INFO log above for more details."));
@@ -294,20 +315,221 @@ class OpConverter {
    engine->ClearWeights();
  }
+  // rank(result) = rank(input)
+  nvinfer1::ITensor* Gather(nvinfer1::ITensor* input,
+                            const std::vector<int32_t> indices,
+                            int axis = 0) {
+    auto* indices_tensor = Add1DConstantLayer(indices, " ");
+    auto* result =
+        TRT_ENGINE_ADD_LAYER(engine_, Gather, *input, *indices_tensor, axis)
+            ->getOutput(0);
+    return result;
+  }
+  // Paddle allows negative indices:
+  // for an axis of length 5, Paddle accepts indices in [-5, 4].
+  nvinfer1::ITensor* FixNegIndices(nvinfer1::ITensor* input_shape,
+                                   nvinfer1::ITensor* indices) {
+    int rank = input_shape->getDimensions().nbDims;
+    std::vector<int32_t> zero = std::vector<int32_t>(rank, 0);
+    std::vector<int32_t> minus_one = std::vector<int32_t>(rank, -1);
+    nvinfer1::ITensor* zero_tensor = Add1DConstantLayer(zero);
+    nvinfer1::ITensor* minus_one_tensor = Add1DConstantLayer(minus_one);
+    // -1, 0
+    auto* sign = Max(Min(indices, zero_tensor), minus_one_tensor);
+    return Sub(indices, Prod(sign, input_shape));
+  }
+  nvinfer1::ITensor* Shape(nvinfer1::ITensor* input) {
+    return TRT_ENGINE_ADD_LAYER(engine_, Shape, *input)->getOutput(0);
+  }
+  // Concat does not change the rank.
+  nvinfer1::ITensor* Concat(const std::vector<nvinfer1::ITensor*>& inputs,
+                            int axis = 0) {
+    auto* layer = TRT_ENGINE_ADD_LAYER(
+        engine_, Concatenation, inputs.data(), inputs.size());
+    if (axis != 0) layer->setAxis(axis);
+    nvinfer1::ITensor* c = layer->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Sum(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kSUM)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Prod(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kPROD)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Min(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kMIN)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Max(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kMAX)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Sub(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kSUB)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Div(nvinfer1::ITensor* a, nvinfer1::ITensor* b) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(
+            engine_, ElementWise, *a, *b, nvinfer1::ElementWiseOperation::kDIV)
+            ->getOutput(0);
+    return c;
+  }
+  nvinfer1::ITensor* Act(nvinfer1::ITensor* a,
+                         nvinfer1::ActivationType act_type) {
+    nvinfer1::ITensor* c =
+        TRT_ENGINE_ADD_LAYER(engine_, Activation, *a, act_type)->getOutput(0);
+    return c;
+  }
+  // Get the element at `index` of a 1D shape tensor, returned as a tensor
+  nvinfer1::ITensor* GetEleTensorOfShape(nvinfer1::ITensor* shape_tensor,
+                                         int index,
+                                         bool is_scalar = false) {
+    auto* tensor =
+        TRT_ENGINE_ADD_LAYER(engine_,
+                             Gather,
+                             *shape_tensor,
+                             *Add1DConstantLayer(index, " ", is_scalar),
+                             0)
+            ->getOutput(0);
+    return tensor;
+  }
+  // Create and add Multi-D constant float layer
+  nvinfer1::ITensor* AddConstantLayer(const float* data,
+                                      const std::vector<int32_t>& weight_dims,
+                                      const std::string& weight_name) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = std::accumulate(
+        weight_dims.begin(), weight_dims.end(), 1, std::multiplies<int>());
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
+    nvinfer1::Dims trt_dims;
+    trt_dims.nbDims = weight_dims.size();
+    for (size_t i = 0; i < weight_dims.size(); i++)
+      trt_dims.d[i] = weight_dims[i];
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_dims, weight.get());
+    return const_layer->getOutput(0);
+  }
+  // Create and add 1D constant float layer
+  nvinfer1::ITensor* Add1DConstantLayer(const std::vector<float>& data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = data.size();
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<float>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
+    nvinfer1::Dims input_shape;
+    input_shape.nbDims = scalar ? 0 : 1;
+    input_shape.d[0] = data_size;
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
+    return const_layer->getOutput(0);
+  }
+  // Create and add 1D constant int32 layer
+  nvinfer1::ITensor* Add1DConstantLayer(const std::vector<int>& data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::unique_ptr<framework::Tensor> tmp_tensor(new framework::Tensor());
+    int data_size = data.size();
+    tmp_tensor->Resize({data_size});
+    auto* tmp_data = tmp_tensor->mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < data_size; i++) {
+      tmp_data[i] = data[i];
+    }
+    engine_->SetWeights(weight_name, std::move(tmp_tensor));
+    TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32,
+                                  static_cast<void*>(tmp_data),
+                                  static_cast<size_t>(data_size)};
+    nvinfer1::Dims input_shape;
+    input_shape.nbDims = scalar ? 0 : 1;
+    input_shape.d[0] = data_size;
+    auto const_layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get());
+    return const_layer->getOutput(0);
+  }
+  nvinfer1::ITensor* Add1DConstantLayer(nvinfer1::Dims data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::vector<int> tmp_data;
+    for (int i = 0; i < data.nbDims; i++) tmp_data.push_back(data.d[i]);
+    return Add1DConstantLayer(tmp_data, weight_name, scalar);
+  }
+  nvinfer1::ITensor* Add1DConstantLayer(int32_t data,
+                                        const std::string& weight_name = "",
+                                        bool scalar = false) {
+    std::vector<int> tmp_data;
+    tmp_data.push_back(data);
+    return Add1DConstantLayer(tmp_data, weight_name, scalar);
+  }
  void RreplenishLayerAndOutput(
-      nvinfer1::ILayer* layer, const std::string& layer_type,
+      nvinfer1::ILayer* layer,
+      const std::string& layer_type,
      const std::vector<std::string>& output_tensor_names,
      bool test_mode = false) {
    size_t num_out = output_tensor_names.size();
+    std::string layer_name = layer_type + " (Output: ";
    for (size_t i = 0; i < num_out; i++) {
      layer->getOutput(i)->setName(output_tensor_names[i].c_str());
      engine_->SetITensor(output_tensor_names[i], layer->getOutput(i));
      if (test_mode) {
        engine_->DeclareOutput(output_tensor_names[i]);
      }
+      layer_name += output_tensor_names[i];
+      if (i != num_out - 1) layer_name += ", ";
    }
-    layer->setName(
+    layer->setName((layer_name + ")").c_str());
-        (layer_type + " (Output: " + output_tensor_names[0] + ")").c_str());
  }
  void SetEngine(TensorRTEngine* engine) { engine_ = engine; }

--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -66,13 +66,16 @@ TRT_DT FluidDataType2TRT(FluidDT type) {
 // The T can be int32 or int64 type.
 template <typename T>
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape,
+                            std::string input,
                            bool with_dynamic_shape = false) {
-  PADDLE_ENFORCE_GT(shape.size(), 0UL,
+  PADDLE_ENFORCE_GT(shape.size(),
+                    0UL,
                    platform::errors::InvalidArgument(
                        "TensorRT's tensor input requires at least 1 "
                        "dimensions, but input %s has %d dims.",
-                        input, shape.size()));
+                        input,
+                        shape.size()));
  auto ShapeStr = [](const std::vector<T>& shape) {
    std::ostringstream os;
@@ -93,7 +96,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The input [%s] shape of trt subgraph is %s, please enable "
            "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-            input, ShapeStr(shape)));
+            input,
+            ShapeStr(shape)));
      }
      return nvinfer1::Dims3(shape[1], shape[2], shape[3]);
    } else if (shape.size() == 5UL) {
@@ -101,7 +105,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The input [%s] shape of trt subgraph is %s, please enable "
            "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-            input, ShapeStr(shape)));
+            input,
+            ShapeStr(shape)));
      }
      return nvinfer1::Dims4(shape[1], shape[2], shape[3], shape[4]);
    } else if (shape.size() == 3UL) {
@@ -109,7 +114,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The input [%s] shape of trt subgraph is %s, please enable "
            "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-            input, ShapeStr(shape)));
+            input,
+            ShapeStr(shape)));
      }
      return nvinfer1::Dims2(shape[1], shape[2]);
    } else if (shape.size() == 2UL) {
@@ -117,7 +123,8 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
        PADDLE_THROW(platform::errors::InvalidArgument(
            "The input [%s] shape of trt subgraph is %s, please enable "
            "trt dynamic_shape mode by SetTRTDynamicShapeInfo.",
-            input, ShapeStr(shape)));
+            input,
+            ShapeStr(shape)));
      }
      nvinfer1::Dims dims;
      dims.nbDims = 1;
@@ -125,11 +132,13 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
      return dims;
    }
    // static shape doesn't support 1D op so far.
-    PADDLE_ENFORCE_NE(shape.size(), 1UL,
+    PADDLE_ENFORCE_NE(shape.size(),
+                      1UL,
                      platform::errors::InvalidArgument(
                          "The input [%s] shape of trt subgraph is %s."
                          "it's not supported by trt so far",
-                          input, ShapeStr(shape)));
+                          input,
+                          ShapeStr(shape)));
    nvinfer1::Dims dims;
    dims.nbDims = shape.size() - 1;
@@ -151,7 +160,7 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
    return dims;
  }
 }
-}  // NOLINT
+}  // namespace
 class TRTInt8Calibrator;
@@ -184,9 +193,11 @@ class TensorRTEngine {
  };
  TensorRTEngine(
-      int max_batch, int max_workspace,
+      int max_batch,
+      int max_workspace,
      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
-      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      TRTInt8Calibrator* calibrator = nullptr,
+      int device_id = 0,
      const ShapeMapType min_input_shape = {},
      const ShapeMapType max_input_shape = {},
      const ShapeMapType optim_input_shape = {},
@@ -205,17 +216,21 @@ class TensorRTEngine {
    if (min_input_shape_.size() != 0 && max_input_shape_.size() != 0 &&
        optim_input_shape_.size() != 0) {
      PADDLE_ENFORCE_EQ(
-          min_input_shape_.size(), max_input_shape_.size(),
+          min_input_shape_.size(),
+          max_input_shape_.size(),
          platform::errors::InvalidArgument(
              "The min_input_shape_'s size(%d) should be equal to the "
              "size(%d) of max_input_shape_",
-              min_input_shape_.size(), max_input_shape_.size()));
+              min_input_shape_.size(),
+              max_input_shape_.size()));
      PADDLE_ENFORCE_EQ(
-          min_input_shape_.size(), optim_input_shape_.size(),
+          min_input_shape_.size(),
+          optim_input_shape_.size(),
          platform::errors::InvalidArgument(
              "The min_input_shape_'s size(%d) should be equal to the "
              "size(%d) of optim_input_shape_",
-              min_input_shape_.size(), optim_input_shape_.size()));
+              min_input_shape_.size(),
+              optim_input_shape_.size()));
 #if IS_TRT_VERSION_GE(6000)
      with_dynamic_shape_ = true;
 #else
@@ -242,7 +257,8 @@ class TensorRTEngine {
                                  const nvinfer1::Dims& dim);
  // Set the offset-th output from a layer as the network's output, and set its
  // name.
-  void DeclareOutput(const nvinfer1::ILayer* layer, int offset,
+  void DeclareOutput(const nvinfer1::ILayer* layer,
+                     int offset,
                     const std::string& name);
  // Set the itensor_map_[name] as the network's output, and set its name.
  void DeclareOutput(const std::string& name);
@@ -374,7 +390,8 @@ class TensorRTEngine {
  int GetDeviceId() { return device_id_; }
  nvinfer1::IPluginV2Layer* AddPlugin(nvinfer1::ITensor* const* inputs,
-                                      int num_inputs, plugin::PluginTensorRT*);
+                                      int num_inputs,
+                                      plugin::PluginTensorRT*);
  nvinfer1::IPluginV2Layer* AddPluginV2Ext(nvinfer1::ITensor* const* inputs,
                                           int num_inputs,
@@ -431,7 +448,8 @@ class TensorRTEngine {
  // After finishing adding ops, freeze this network and create the execution
  // environment.
  void FreezeNetwork();
-  void Execute(int batch_size, std::vector<void*>* buffers,
+  void Execute(int batch_size,
+               std::vector<void*>* buffers,
               cudaStream_t stream = nullptr);
  nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
@@ -448,15 +466,20 @@ class TensorRTEngine {
      auto name = it.first;
      auto input_shape = it.second;
      PADDLE_ENFORCE_EQ(
-          min_input_shape_.count(name), true,
+          min_input_shape_.count(name),
+          true,
          platform::errors::InvalidArgument(
              "TRT dynamic_shape min_input_shape %s not found.", name));
-      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(), input_shape.size(),
+      PADDLE_ENFORCE_EQ(min_input_shape_[name].size(),
+                        input_shape.size(),
                        platform::errors::InvalidArgument(
                            "TRT dynamic_shape min_input_shape %s size not "
                            "equal, the min_input_shape[%s].size()=%d"
                            ", but the runtime_input_shape[%s].size()=%d.",
-                            name, name, min_input_shape_[name].size(), name,
+                            name,
+                            name,
+                            min_input_shape_[name].size(),
+                            name,
                            input_shape.size()));
      auto bak_min_shape = min_input_shape_[name];
      auto bak_max_shape = max_input_shape_[name];
@@ -497,7 +520,8 @@ class TensorRTEngine {
 #if IS_TRT_VERSION_GE(6000)
  nvinfer1::IPluginV2Layer* AddDynamicPlugin(
-      nvinfer1::ITensor* const* inputs, int num_inputs,
+      nvinfer1::ITensor* const* inputs,
+      int num_inputs,
      plugin::DynamicPluginTensorRT* plugin) {
    owned_pluginv2_.emplace_back(plugin);
    return network()->addPluginV2(inputs, num_inputs, *plugin);
@@ -524,7 +548,8 @@ class TensorRTEngine {
  void Set(const std::string& attr_name, AttrType* attr) {
    if (attrs_.count(attr_name) == 0) {
      PADDLE_ENFORCE_EQ(
-          attrs_.count(attr_name), 0,
+          attrs_.count(attr_name),
+          0,
          platform::errors::AlreadyExists(
              "Attribute %s already set in trt engine.", attr_name));
    } else {
@@ -543,7 +568,8 @@ class TensorRTEngine {
  template <typename AttrType>
  void SetNotOwned(const std::string& attr_name, AttrType* attr) {
    PADDLE_ENFORCE_EQ(
-        attrs_.count(attr_name), 0,
+        attrs_.count(attr_name),
+        0,
        platform::errors::AlreadyExists(
            "Attribute %s already set in trt engine.", attr_name));
    attrs_[attr_name] = attr;
@@ -552,7 +578,8 @@ class TensorRTEngine {
  // Get a reference to the attribute previously set.
  template <typename AttrType>
  AttrType& Get(const std::string& attr_name) const {
-    PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(),
+    PADDLE_ENFORCE_NE(attrs_.find(attr_name),
+                      attrs_.end(),
                      platform::errors::InvalidArgument(
                          "Attribute %s not found in trt engine.", attr_name));
    try {
@@ -574,7 +601,8 @@ class TensorRTEngine {
      };
      PADDLE_THROW(platform::errors::InvalidArgument(
-          "Invalid type for attritube %s, expected: %s, actual: %s.", attr_name,
+          "Invalid type for attritube %s, expected: %s, actual: %s.",
+          attr_name,
          TypeToString(typeid(AttrType*)),
          TypeToString(attrs_.at(attr_name).type())));
    }
@@ -672,7 +700,7 @@ class TensorRTEngine {
 // them, and a macro like this is more extensible when the underlying TensorRT
 // library adds new layer support.
 // After this change (+ line) the expansion carries no trailing semicolon, so
 // it is a plain expression and the call site supplies its own `;`.
 #define TRT_ENGINE_ADD_LAYER(engine__, layer__, ...) \
-  engine__->network()->add##layer__(__VA_ARGS__);
+  engine__->network()->add##layer__(__VA_ARGS__)
 class TRTEngineManager {
 public:
@@ -687,18 +715,27 @@ class TRTEngineManager {
  }
  // Constructs a TensorRTEngine from the given arguments (forwarded in the
  // same order to the TensorRTEngine constructor), hands the new object to
  // engines_[name] via reset(), and returns the raw pointer to it.
  // NOTE(review): engines_ presumably maps names to owning smart pointers, in
  // which case the returned pointer is non-owning and an engine previously
  // stored under the same name is released here — confirm against the
  // declaration of engines_.
  TensorRTEngine* Create(
-      std::string name, int max_batch, int max_workspace,
+      std::string name,
+      int max_batch,
+      int max_workspace,
      AnalysisConfig::Precision precision = AnalysisConfig::Precision::kFloat32,
-      TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
+      TRTInt8Calibrator* calibrator = nullptr,
+      int device_id = 0,
      const std::map<std::string, std::vector<int>> min_input_shape = {},
      const std::map<std::string, std::vector<int>> max_input_shape = {},
      const std::map<std::string, std::vector<int>> optim_input_shape = {},
      bool disable_trt_plugin_fp16 = false,
      nvinfer1::ILogger& logger = NaiveLogger::Global()) {
-    auto* p =
+    auto* p = new TensorRTEngine(max_batch,
-        new TensorRTEngine(max_batch, max_workspace, precision, calibrator,
+                                 max_workspace,
-                           device_id, min_input_shape, max_input_shape,
+                                 precision,
-                           optim_input_shape, disable_trt_plugin_fp16, logger);
+                                 calibrator,
+                                 device_id,
+                                 min_input_shape,
+                                 max_input_shape,
+                                 optim_input_shape,
+                                 disable_trt_plugin_fp16,
+                                 logger);
    engines_[name].reset(p);
    return p;
  }