Unverified commit fa06d9c3, authored by Wangzheee, committed by GitHub

fix_multihead (#45429)

Parent a5e9ccda
@@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter {
plugin_inputs.data(), plugin_inputs.size(), *plugin);
layer = plugin_layer;
}
    } else {
      if (input_dims.d[1] <= 384 && !bias_qk_attr &&
          engine_->precision() != AnalysisConfig::Precision::kFloat32) {
        /*
         * input_dims.d[0]: batch(-1)
         * input_dims.d[1]: length:256
         * input_dims.d[2]: hidden_size:768
         input
           |[b,256,768]
           |
         shuffle                weight   bias
           |[b,256,768,1,1]       |        |
           |______________________|________|
           |
           fc
           |[b,256,2304,1,1]
           |
         shuffle           mask(fake)   pos    max_length
           |[b*256,2304,1,1]   |         |         |
           |                   |         |         |
           |___________________|_________|_________|
           |
           MHA
           |[b*256,768]
           |
         shuffle
           |[b, 256, 768]
           |
           out
         */
        nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
                                 static_cast<void*>(weight_data),
                                 static_cast<int32_t>(weight_t->numel())};
        nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
                               static_cast<void*>(bias_data),
                               static_cast<int32_t>(bias_t->numel())};
        /*** transpose the weight and bias ***/
        int head_size = hidden_out / head_number;
        // [3, head_number, head_size, hidden_in] -> [head_number, 3,
        // head_size, hidden_in]
        auto transpose_weight_v2 = [](const float* src,
                                      float* dst,
                                      int three,
                                      int head_number,
                                      int head_size,
                                      int hidden_in) {
          const int HH = head_size * hidden_in;
          for (int i = 0; i < three; ++i) {
            for (int n = 0; n < head_number; ++n) {
              for (int hh = 0; hh < HH; ++hh) {
                dst[n * three * HH + i * HH + hh] =
                    src[i * head_number * HH + n * HH + hh];
              }
            }
          }
        };
        // [3, head_number, head_size] -> [head_number, 3, head_size]
        auto transpose_bias_v2 =
            [](const float* src, float* dst, int N, int H) {
              for (int i = 0; i < 3; ++i) {
                for (int n = 0; n < N; ++n) {
                  for (int h = 0; h < H; ++h) {
                    dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h];
                  }
                }
              }
            };
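        // Sanity check of the interleave for head_number = 2: source rows
        // [Q0, Q1, K0, K1, V0, V1] land as [Q0, K0, V0, Q1, K1, V1], i.e.
        // each head's Q/K/V slices become contiguous, which is the
        // packed-QKV channel order the fused var-seqlen MHA kernel expects.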
memcpy(weight_data_tmp.data(),
weight_data,
weight_t->numel() * sizeof(float));
transpose_weight_v2(weight_data_tmp.data(),
weight_data,
three,
head_number,
head_size,
hidden_in);
std::vector<float> bias_data_tmp;
        bias_data_tmp.resize(bias_t->numel());  // resize (not reserve): the memcpy below fills numel() elements
memcpy(
bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
transpose_bias_v2(
bias_data_tmp.data(), bias_data, head_number, head_size);
// add shuffle for FullyConnected layer
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_fc_multihead_matmul(Output: " + output_name + ")")
.c_str());
// add fc layer
nvinfer1::ILayer* fc_layer = nullptr;
fc_layer = TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight,
bias);
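        // This single FC computes the Q, K and V projections in one pass
        // (n = 3 * hidden_out, e.g. 2304 channels for hidden 768 as in the
        // diagram above); the plugin splits the channels per head later.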
// add shuffle for CustomQKVToContextPluginDynamic layer
auto* reshape_after_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
mha_input_tensor_shape.push_back(
Add1DConstantLayer(hidden_out * 3)); // Q,K,V
mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
reshape_after_fc_layer->setName(
("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")")
.c_str());
// add mha_plugin
auto creator = GetPluginRegistry()->getPluginCreator(
"CustomQKVToContextPluginDynamic", "2");
assert(creator != nullptr);
// set the attributes of mha_plugin
int type = static_cast<int>(nvinfer1::DataType::kHALF);
int var_seqlen = 1;
bool has_mask = true;
std::vector<nvinfer1::PluginField> fields{
{"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1},
{"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
{"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
{"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
{"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}};
nvinfer1::PluginFieldCollection* plugin_collection =
static_cast<nvinfer1::PluginFieldCollection*>(
malloc(sizeof(*plugin_collection) +
fields.size() *
sizeof(nvinfer1::PluginField))); // remember to free
plugin_collection->nbFields = static_cast<int>(fields.size());
plugin_collection->fields = fields.data();
auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
plugin_collection);
free(plugin_collection);
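        // Inputs for version "2" of the plugin, assembled below:
        //   input_0: packed QKV, [b*seq, 3*hidden, 1, 1]
        //   input_1: mask (a placeholder constant in this path)
        //   input_2: pos_id, cumulative sequence offsets (batch + 1 entries)
        //   input_3: a tensor whose length carries the max sequence length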
// set inputs
std::vector<nvinfer1::ITensor*> plugin_inputs;
// input_0 for plugin
plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
// input_1(fake) for plugin
std::vector<int> mask = {1};
nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
plugin_inputs.emplace_back(mask_tensor);
// input_2 for plugin
std::vector<int> pos_id = {0};
int max_batch = 500;
for (int i = 1; i < max_batch; i++) {
pos_id.push_back(i);
}
nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
nvinfer1::ITensor* length_tensor =
GetEleTensorOfShape(input_shape_tensor, 1);
auto pos_id_layer =
TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*fake_pos_id_tensor,
*length_tensor,
nvinfer1::ElementWiseOperation::kPROD);
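        // fake_pos_id_tensor is the constant [0, 1, ..., 499]; scaling it
        // by the runtime length S gives [0, S, 2S, ...]. Since every
        // sequence in the padded batch shares the same length, these are
        // the cumulative offsets the plugin needs; the slice below trims
        // them to batch + 1 entries.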
// size = batch + 1;
nvinfer1::ITensor* batch_tensor =
GetEleTensorOfShape(input_shape_tensor, 0);
std::vector<int> const_data = {1};
nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
auto size_layer =
TRT_ENGINE_ADD_LAYER(engine_,
ElementWise,
*batch_tensor,
*const_tensor,
nvinfer1::ElementWiseOperation::kSUM);
// get size(batch + 1) data from pos_id_tensor
nvinfer1::Dims start;
nvinfer1::Dims stride;
nvinfer1::Dims size;
start.nbDims = 1;
stride.nbDims = 1;
size.nbDims = 1;
start.d[0] = 0;
stride.d[0] = 1;
size.d[0] = 1;
auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
slice_pos_layer->setInput(2, *size_layer->getOutput(0));
plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
// input_3 for plugin
std::vector<int> data(500, 1);
nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
slice_max_layer->setInput(2, *length_tensor);
plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
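        // Only the length of this slice matters: the all-ones constant is
        // cut to S elements so the plugin can read the max sequence length
        // off the tensor shape; the values themselves should never be
        // consumed.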
// plugin_layer
auto plugin_layer = engine_->network()->addPluginV2(
plugin_inputs.data(), plugin_inputs.size(), *plugin);
// add shuffle
auto* reshape_after_mha_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0));
std::vector<nvinfer1::ITensor*> reshape_tensor;
reshape_tensor.push_back(batch_tensor);
reshape_tensor.push_back(length_tensor);
reshape_tensor.push_back(Add1DConstantLayer(-1));
reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
reshape_after_mha_layer->setName(
("shuffle_last_multihead_matmul(Output: " + output_name + ")")
.c_str());
// return
layer = reshape_after_mha_layer;
} else {
PADDLE_ENFORCE_EQ(
input->getDimensions().nbDims,
3,
platform::errors::InvalidArgument(
"The Input dim of the MultiheadMatMul should be 3, "
"but it's (%d) now.",
input->getDimensions().nbDims));
// transpose weight_data from m * n to n * m
auto* input_bias_qk =
engine_->GetITensor(op_desc.Input("BiasQK").front());
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_t->numel())};
weight.dims.assign({n, m});
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<size_t>(bias_t->numel())};
// add shuffle before fc
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
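        // If the op carries an input quantization scale, stamp it onto the
        // shuffle output so the int8 dynamic range survives the reshape.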
if (op_desc.HasAttr("Input_scale")) {
engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0),
in_scale);
}
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_multihead_mamul(Output: " + output_name + ")")
.c_str());
// add layer fc
nvinfer1::ILayer* fc_layer = nullptr;
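        // With an Input_scale (int8 mode) the projection is built as a 1x1
        // Convolution instead of FullyConnected; on the reshaped
        // [b, s, hidden, 1, 1] input the two are equivalent, and the conv
        // form presumably lets TensorRT select int8 kernels.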
if (op_desc.HasAttr("Input_scale")) {
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*reshape_before_fc_layer->getOutput(0),
n,
nv_ksize,
weight.get(),
bias.get());
} else {
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight.get(),
bias.get());
}
if (op_desc.HasAttr("fc_out_threshold")) {
          PADDLE_ENFORCE_EQ(
              op_desc.HasAttr("fc_out_threshold"),
              true,
              platform::errors::InvalidArgument(
                  "must have out threshold in multihead layers in int8 mode"));
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
}
fc_layer->setName(
("multihead_mamul_fc(Output: " + output_name + ")").c_str());
"The Input dim of the MultiheadMatMul should be 3, "
"but it's (%d) now.",
input->getDimensions().nbDims));
// transpose weight_data from m * n to n * m
auto* input_bias_qk =
engine_->GetITensor(op_desc.Input("BiasQK").front());
TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
static_cast<void*>(weight_data),
static_cast<size_t>(weight_t->numel())};
weight.dims.assign({n, m});
TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
static_cast<void*>(bias_data),
static_cast<size_t>(bias_t->numel())};
// add shuffle before fc
std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
nvinfer1::ITensor* input_shape_tensor = Shape(input);
for (int i = 0; i < 5; i++) {
reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
}
for (int i = 0; i < 3; i++) {
reshape_before_fc_shape_tensor[i] =
GetEleTensorOfShape(input_shape_tensor, i);
}
auto* reshape_before_fc_layer =
TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
if (op_desc.HasAttr("Input_scale")) {
engine_->SetTensorDynamicRange(
reshape_before_fc_layer->getOutput(0), in_scale);
}
reshape_before_fc_layer->setInput(
1, *Concat(reshape_before_fc_shape_tensor));
reshape_before_fc_layer->setName(
("shuffle_before_multihead_mamul(Output: " + output_name + ")")
.c_str());
// add layer fc
nvinfer1::ILayer* fc_layer = nullptr;
if (op_desc.HasAttr("Input_scale")) {
nvinfer1::DimsHW nv_ksize(1, 1);
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
Convolution,
*reshape_before_fc_layer->getOutput(0),
n,
nv_ksize,
weight.get(),
bias.get());
} else {
fc_layer =
TRT_ENGINE_ADD_LAYER(engine_,
FullyConnected,
*reshape_before_fc_layer->getOutput(0),
n,
weight.get(),
bias.get());
}
if (op_desc.HasAttr("fc_out_threshold")) {
PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
true,
platform::errors::InvalidArgument(
"must have out threshold in multihead layers "
"in int8 mode"));
float out_scale =
PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
}
fc_layer->setName(
("multihead_mamul_fc(Output: " + output_name + ")").c_str());
        // no need to add shuffle after fc, just change it in
        // QkvToContextPluginDynamic
        // add qkv to context
        int head_size = hidden_out / head_number;
        float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
        std::vector<nvinfer1::ITensor*> plugin_inputs;
        plugin_inputs.push_back(fc_layer->getOutput(0));
        plugin_inputs.push_back(input_bias_qk);
        bool with_fp16 =
            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
        if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
          with_fp16 = true;
        }
        plugin::DynamicPluginTensorRT* plugin =
            new plugin::QkvToContextPluginDynamic(
                hidden_in, head_number, head_size, scale, with_fp16);
        layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
      }
    }
} else {
PADDLE_THROW(platform::errors::Fatal(
......