From 9d04ef73692f38247e68e121a44bd34f9f28652c Mon Sep 17 00:00:00 2001
From: Shang Zhizhou <shangzhizhou@baidu.com>
Date: Tue, 23 Mar 2021 14:00:22 +0800
Subject: [PATCH] fix tensorrt output varible reshape (#31733)

* fix tensorrt output varible reshape

* move padding shape x 1 x 1 in ernie to qkv and fc

* update layer name

* fix softmax when input is dynamic, fc not padding any more

* fix varlen

* move fc x_dim assert to op_teller
---
 .../ir_passes/tensorrt_subgraph_pass.cc       |  8 ++-
 .../fluid/inference/tensorrt/convert/fc_op.cc | 70 +++++++++++++++++--
 .../tensorrt/convert/multihead_matmul_op.cc   | 43 +++++++++---
 .../inference/tensorrt/convert/softmax_op.cc  | 13 ++--
 paddle/fluid/inference/tensorrt/op_teller.cc  | 12 +++-
 .../plugin/emb_eltwise_layernorm_plugin.cu    |  4 +-
 .../tensorrt/plugin/qkv_to_context_plugin.cu  |  4 +-
 .../plugin/skip_layernorm_op_plugin.cu        |  5 --
 .../tensorrt/plugin/special_slice_plugin.cu   |  2 +
 9 files changed, 125 insertions(+), 36 deletions(-)
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 59ed09b96c..60de4234b4 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -168,11 +168,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
 
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
-  std::vector<int> origin_output_dims;
+  std::map<std::string, int> origin_name_output_dims;
   for (auto *x : node->outputs) {
     output_names.insert(x->Name());
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
-    origin_output_dims.push_back(x->Var()->GetShape().size());
+    origin_name_output_dims[x->Name()] = x->Var()->GetShape().size();
   }
 
   std::unordered_map<std::string, std::string> output_name_map;
@@ -216,11 +216,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   // output_mapping help us copy the data from the renamed ITensor
   // to Tensor.
   std::vector<std::string> output_mapping;
+  std::vector<int> renamed_output_dims;
   for (auto name : output_names) {
     PADDLE_ENFORCE_NE(output_name_map.count(name), 0,
                       platform::errors::PreconditionNotMet(
                           "The output_name_map should have %s", name));
     output_mapping.push_back(output_name_map[name]);
+    renamed_output_dims.push_back(origin_name_output_dims[name]);
   }
   PADDLE_ENFORCE_EQ(output_mapping.empty(), false,
                     platform::errors::PreconditionNotMet(
@@ -243,7 +245,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
   op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
   op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
   op_desc->SetAttr("output_name_mapping", output_mapping);
-  op_desc->SetAttr("origin_output_dims", origin_output_dims);
+  op_desc->SetAttr("origin_output_dims", renamed_output_dims);
   op_desc->SetAttr("parameters", params);
 
   // we record all inputs' shapes in attr to check if they are consistent
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index 41fbbb557d..527d0ee208 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -144,7 +144,69 @@ class FcOpConverter : public OpConverter {
                                 static_cast<size_t>(bias_num)};
 
     if (engine_->with_dynamic_shape()) {
-      regist_fc(X, n_output, weight, bias);
+      // not NCHW layout, but NLP layout with added 'x 1 x 1'
+      auto x_dim = X->getDimensions();
+      if (x_dim.nbDims == 3 || x_dim.nbDims == 2) {
+        auto output_name = op_desc.Output("Out").front();
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = x_dim.nbDims + 2;
+        for (int i = 0; i < x_dim.nbDims; i++) {
+          reshape_before_fc_dim.d[i] = 0;
+        }
+        reshape_before_fc_dim.d[x_dim.nbDims] = 1;
+        reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_fc(Output: " + output_name + ")").c_str());
+
+        // add fc layer
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0),
+            n_output, weight.get(), bias.get());
+        fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str());
+
+        // add shuffle after fc
+        nvinfer1::Dims reshape_after_fc_dim;
+        if (x_dim.nbDims == 3) {
+          if (x_num_col_dims == 2) {
+            reshape_after_fc_dim.nbDims = 3;
+            reshape_after_fc_dim.d[0] = 0;
+            reshape_after_fc_dim.d[1] = 0;
+            reshape_after_fc_dim.d[2] = 0;
+          } else {
+            reshape_after_fc_dim.nbDims = 2;
+            reshape_after_fc_dim.d[0] = 0;
+            auto dim = fc_layer->getOutput(0)->getDimensions();
+            reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2];
+          }
+          // x_dim.nbDims == 2
+        } else {
+          reshape_after_fc_dim.nbDims = 2;
+          reshape_after_fc_dim.d[0] = 0;
+          reshape_after_fc_dim.d[1] = 0;
+        }
+        auto* reshape_after_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+        reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim);
+
+        if (activation_type == "relu") {
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc(Output: " + output_name + ")").c_str());
+          nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Activation, *(reshape_after_fc_layer->getOutput(0)),
+              nvinfer1::ActivationType::kRELU);
+          RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle",
+                                   {output_name}, test_mode);
+        } else {
+          RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc",
+                                   {output_name}, test_mode);
+        }
+      } else {
+        regist_fc(X, n_output, weight, bias);
+      }
       return;
     }
     // in order to handle situations in NLP models(input dims < 3,
@@ -154,12 +216,6 @@ class FcOpConverter : public OpConverter {
     auto input_d = X->getDimensions().d;
     int reshape_dim3[3] = {0};
     int reshape_dim4[4] = {0};
-    PADDLE_ENFORCE_EQ(
-        x_num_col_dims == 1 || x_num_col_dims == 2, true,
-        platform::errors::InvalidArgument(
-            "Wrong x_num_col_dims param of op mul. Paddle-TRT FC converter "
-            "expects x_num_col_dims is either 1 or 2, but got %d",
-            x_num_col_dims));
     PADDLE_ENFORCE_LE(x_num_col_dims, input_dims,
                       platform::errors::InvalidArgument(
                           "Params and input dims mismatch. Paddle-TRT FC "
diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index ee04fd372c..8ce46a19d4 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -8,8 +8,8 @@ http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See
+the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
@@ -28,7 +28,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
                "network structure";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
-    // Shouble be a 5 dims tensor.
     auto* input = engine_->GetITensor(op_desc.Input("Input").front());
 
     // fc weights and fc bias
@@ -69,6 +68,7 @@ class MultiheadMatMulOpConverter : public OpConverter {
     int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number"));
 
     nvinfer1::ILayer* layer = nullptr;
+    auto output_name = op_desc.Output("Out")[0];
 
     if (engine_->with_dynamic_shape()) {
       if (engine_->use_oss()) {
@@ -171,6 +171,12 @@ class MultiheadMatMulOpConverter : public OpConverter {
             plugin_inputs.data(), plugin_inputs.size(), *plugin);
         layer = plugin_layer;
       } else {
+        PADDLE_ENFORCE_EQ(
+            input->getDimensions().nbDims, 3,
+            platform::errors::InvalidArgument(
+                "The Input dim of the MultiheadMatMul should be 3, "
+                "but it's (%d) now.",
+                input->getDimensions().nbDims));
         // transpose weight_data from m * n to  n * m
         auto* input_bias_qk =
             engine_->GetITensor(op_desc.Input("BiasQK").front());
@@ -184,15 +190,37 @@ class MultiheadMatMulOpConverter : public OpConverter {
                                     static_cast<void*>(bias_data),
                                     static_cast<size_t>(bias_t->numel())};
 
-        auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input,
-                                              n, weight.get(), bias.get());
-        auto* fc_out = fc_layer->getOutput(0);
+        // add shuffle before fc
+        nvinfer1::Dims reshape_before_fc_dim;
+        reshape_before_fc_dim.nbDims = 5;
+        reshape_before_fc_dim.d[0] = 0;
+        reshape_before_fc_dim.d[1] = 0;
+        reshape_before_fc_dim.d[2] = 0;
+        reshape_before_fc_dim.d[3] = 1;
+        reshape_before_fc_dim.d[4] = 1;
+        auto* reshape_before_fc_layer =
+            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+        reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim);
+        reshape_before_fc_layer->setName(
+            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                .c_str());
+
+        // add layer fc
+        auto* fc_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), n,
+            weight.get(), bias.get());
+        fc_layer->setName(
+            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+
+        // no need to add shuffle after fc, just change it in
+        // QkvToContextPluginDynamic
+
         // add qkv to context
         int head_size = hidden_out / head_number;
         float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha"));
 
         std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_out);
+        plugin_inputs.push_back(fc_layer->getOutput(0));
         plugin_inputs.push_back(input_bias_qk);
         bool with_fp16 =
             engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
@@ -208,7 +236,6 @@ class MultiheadMatMulOpConverter : public OpConverter {
           "You can use the config.SetTRTDynamicShapeInfo(...) interface to set "
           "the shape information to run the dynamic shape mode."));
     }
-    auto output_name = op_desc.Output("Out")[0];
     RreplenishLayerAndOutput(layer, "multihead_matmul", {output_name},
                              test_mode);
 #else
diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
index 79992065a2..9cefb24751 100644
--- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc
@@ -51,6 +51,7 @@ class SoftMaxOpConverter : public OpConverter {
     uint32_t axes = std::max(0, input_dims - 3);
     // TODO(cryoco): Poor workaround. Fix padded dims problem when TRT layers
     // support Nd.
+    // Tips: Dynammic shape alreay fixes.
     int padded_dims = 0;
     int explicit_batch = 0;
     if (engine_->with_dynamic_shape()) explicit_batch = 1;
@@ -62,16 +63,16 @@ class SoftMaxOpConverter : public OpConverter {
       }
     }
     if (!engine_->with_dynamic_shape()) {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis - padded_dims;
       } else {
-        axes = axis;
+        axes = axis - 1;
       }
     } else {
-      if (axis == -1) {
-        axes = input_dims - 1 - padded_dims;
+      if (axis < 0) {
+        axes = input_dims + axis;
       } else {
-        axes = axis + 1;
+        axes = axis;
       }
     }
     layer->setAxes(1 << axes);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 2ec94f5f98..11752d71a4 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -195,7 +195,17 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
       // current not support axis from input, use default 0
       if (!with_dynamic_shape || desc.Input("Axis").size() > 0) return false;
     }
-
+    if (op_type == "fc" || op_type == "mul") {
+      const int x_num_col_dims =
+          desc.HasAttr("x_num_col_dims")
+              ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims"))
+              : (desc.HasAttr("in_num_col_dims")
+                     ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims"))
+                     : 1);
+      if (x_num_col_dims != 1 && x_num_col_dims != 2) {
+        return false;
+      }
+    }
     if (op_type == "nearest_interp") {
       std::vector<std::string> attrs{"data_layout",   "interp_method",
                                      "align_corners", "scale",
diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
index 238daa4a88..6d3872aaeb 100644
--- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu
@@ -200,12 +200,10 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions(
                         "but it's (%d)",
                         output_index));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(hidden_size_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
index 1e7c83f4c6..a5fc9e73c5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu
@@ -169,12 +169,10 @@ nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
           "it has (%d) inputs",
           nb_inputs));
   nvinfer1::DimsExprs ret;
-  ret.nbDims = 5;
+  ret.nbDims = 3;
   ret.d[0] = inputs[0].d[0];
   ret.d[1] = inputs[0].d[1];
   ret.d[2] = expr_builder.constant(head_size_ * head_number_);
-  ret.d[3] = expr_builder.constant(1);
-  ret.d[4] = expr_builder.constant(1);
   return ret;
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
index 3b9eea2219..7be9e3a740 100644
--- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu
@@ -54,11 +54,6 @@ void SkipLayerNormPluginDynamic::terminate() {
 nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
     int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
     nvinfer1::IExprBuilder &expr_builder) {
-  PADDLE_ENFORCE_EQ(
-      inputs[0].nbDims, 5,
-      platform::errors::InvalidArgument(
-          "The Input dim of the SkipLayernorm should be 5, but it's (%d) now.",
-          inputs[0].nbDims));
   return inputs[0];
 }
 
diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
index 250b944652..fdb14f9cea 100644
--- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu
@@ -62,6 +62,8 @@ nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions(
   output.d[1] = one;
   output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB,
                                        *inputs[1].d[0], *one);
+  // remove padding 1
+  output.nbDims -= 2;
 
   return output;
 }
-- 
GitLab