diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
index a597a484f9e585b30a997804514abf7b7a000cbb..8443c92241b0c3fa90e1bb7d37a53ded55e9cd9b 100644
--- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc
@@ -291,333 +291,342 @@ class MultiheadMatMulOpConverter : public OpConverter {
               plugin_inputs.data(), plugin_inputs.size(), *plugin);
           layer = plugin_layer;
         }
-      }
-      if (input_dims.d[1] <= 384 && !bias_qk_attr &&
-          engine_->precision() != AnalysisConfig::Precision::kFloat32) {
-        /*
-         * input_dims.d[0]: batch(-1)
-         * input_dims.d[1]: length:256
-         * input_dims.d[2]: hidden_size:768
-         input
-           |[b,256,768]
-           |
-        shuffle                weight    bias
-           |[b,256,768,1,1]      |         |
-           |_____________________|_________|
-           |
-           fc
-           |[b,256,2304,1,1]
-           |
-        shuffle               mask(fake)    pos  max_length
-           |[b*256,2304,1,1]       |         |        |
-           |                       |         |        |
-           |_______________________|_________|________|
-           |
-          MHA
-           |[b*256,768]
-           |
-        shuffle
-           |[b, 256, 768]
-           |
-          out
-        */
-
-        nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
-                                 static_cast<void*>(weight_data),
-                                 static_cast<int32_t>(weight_t->numel())};
-        nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
-                               static_cast<void*>(bias_data),
-                               static_cast<int32_t>(bias_t->numel())};
-
-        /*** transpose the weight and bias ***/
-        int head_size = hidden_out / head_number;
-        // [3, head_number, head_size, hidden_in] -> [head_number, 3,
-        // head_size, hidden_in]
-        auto transpose_weight_v2 = [](const float* src,
-                                      float* dst,
-                                      int three,
-                                      int head_number,
-                                      int head_size,
-                                      int hidden_in) {
-          const int HH = head_size * hidden_in;
-          for (int i = 0; i < three; ++i) {
-            for (int n = 0; n < head_number; ++n) {
-              for (int hh = 0; hh < HH; ++hh) {
-                dst[n * three * HH + i * HH + hh] =
-                    src[i * head_number * HH + n * HH + hh];
-              }
-            }
-          }
-        };
-        // [3, head_number, head_size] -> [head_number, 3, head_size]
-        auto transpose_bias_v2 =
-            [](const float* src, float* dst, int N, int H) {
-              for (int i = 0; i < 3; ++i) {
-                for (int n = 0; n < N; ++n) {
-                  for (int h = 0; h < H; ++h) {
-                    dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h];
-                  }
-                }
-              }
-            };
-        memcpy(weight_data_tmp.data(),
-               weight_data,
-               weight_t->numel() * sizeof(float));
-        transpose_weight_v2(weight_data_tmp.data(),
-                            weight_data,
-                            three,
-                            head_number,
-                            head_size,
-                            hidden_in);
-
-        std::vector<float> bias_data_tmp;
-        bias_data_tmp.reserve(bias_t->numel());
-        memcpy(
-            bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
-        transpose_bias_v2(
-            bias_data_tmp.data(), bias_data, head_number, head_size);
-
-        // add shuffle for FullyConnected layer
-        std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
-        nvinfer1::ITensor* input_shape_tensor = Shape(input);
-        for (int i = 0; i < 5; i++) {
-          reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
-        }
-        for (int i = 0; i < 3; i++) {
-          reshape_before_fc_shape_tensor[i] =
-              GetEleTensorOfShape(input_shape_tensor, i);
-        }
-        auto* reshape_before_fc_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-        reshape_before_fc_layer->setInput(
-            1, *Concat(reshape_before_fc_shape_tensor));
-        reshape_before_fc_layer->setName(
-            ("shuffle_before_fc_multihead_matmul(Output: " + output_name + ")")
-                .c_str());
-
-        // add fc layer
-        nvinfer1::ILayer* fc_layer = nullptr;
-        fc_layer = TRT_ENGINE_ADD_LAYER(engine_,
-                                        FullyConnected,
-                                        *reshape_before_fc_layer->getOutput(0),
-                                        n,
-                                        weight,
-                                        bias);
-
-        // add shuffle for CustomQKVToContextPluginDynamic layer
-        auto* reshape_after_fc_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
-        std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
-        mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
-        mha_input_tensor_shape.push_back(
-            Add1DConstantLayer(hidden_out * 3));  // Q,K,V
-        mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
-        mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
-        reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
-        reshape_after_fc_layer->setName(
-            ("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")")
-                .c_str());
-
-        // add mha_plugin
-        auto creator = GetPluginRegistry()->getPluginCreator(
-            "CustomQKVToContextPluginDynamic", "2");
-        assert(creator != nullptr);
-        // set the attributes of mha_plugin
-        int type = static_cast<int>(nvinfer1::DataType::kHALF);
-        int var_seqlen = 1;
-        bool has_mask = true;
-        std::vector<nvinfer1::PluginField> fields{
-            {"hidden_size", &hidden_out, nvinfer1::PluginFieldType::kINT32, 1},
-            {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
-            {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
-            {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
-            {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}};
-        nvinfer1::PluginFieldCollection* plugin_collection =
-            static_cast<nvinfer1::PluginFieldCollection*>(
-                malloc(sizeof(*plugin_collection) +
-                       fields.size() *
-                           sizeof(nvinfer1::PluginField)));  // remember to free
-        plugin_collection->nbFields = static_cast<int>(fields.size());
-        plugin_collection->fields = fields.data();
-        auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
-                                            plugin_collection);
-        free(plugin_collection);
-        // set inputs
-        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        // input_0 for plugin
-        plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
-        // input_1(fake) for plugin
-        std::vector<int> mask = {1};
-        nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
-        plugin_inputs.emplace_back(mask_tensor);
-        // input_2 for plugin
-        std::vector<int> pos_id = {0};
-        int max_batch = 500;
-        for (int i = 1; i < max_batch; i++) {
-          pos_id.push_back(i);
-        }
-        nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
-        nvinfer1::ITensor* length_tensor =
-            GetEleTensorOfShape(input_shape_tensor, 1);
-        auto pos_id_layer =
-            TRT_ENGINE_ADD_LAYER(engine_,
-                                 ElementWise,
-                                 *fake_pos_id_tensor,
-                                 *length_tensor,
-                                 nvinfer1::ElementWiseOperation::kPROD);
-        // size = batch + 1;
-        nvinfer1::ITensor* batch_tensor =
-            GetEleTensorOfShape(input_shape_tensor, 0);
-        std::vector<int> const_data = {1};
-        nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
-        auto size_layer =
-            TRT_ENGINE_ADD_LAYER(engine_,
-                                 ElementWise,
-                                 *batch_tensor,
-                                 *const_tensor,
-                                 nvinfer1::ElementWiseOperation::kSUM);
-        // get size(batch + 1) data from pos_id_tensor
-        nvinfer1::Dims start;
-        nvinfer1::Dims stride;
-        nvinfer1::Dims size;
-
-        start.nbDims = 1;
-        stride.nbDims = 1;
-        size.nbDims = 1;
-
-        start.d[0] = 0;
-        stride.d[0] = 1;
-        size.d[0] = 1;
-
-        auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
-        slice_pos_layer->setInput(2, *size_layer->getOutput(0));
-        plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
-
-        // input_3 for plugin
-        std::vector<int> data(500, 1);
-        nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
-        auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
-            engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
-        slice_max_layer->setInput(2, *length_tensor);
-        plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
-        // plugin_layer
-        auto plugin_layer = engine_->network()->addPluginV2(
-            plugin_inputs.data(), plugin_inputs.size(), *plugin);
-
-        // add shuffle
-        auto* reshape_after_mha_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0));
-        std::vector<nvinfer1::ITensor*> reshape_tensor;
-        reshape_tensor.push_back(batch_tensor);
-        reshape_tensor.push_back(length_tensor);
-        reshape_tensor.push_back(Add1DConstantLayer(-1));
-        reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
-        reshape_after_mha_layer->setName(
-            ("shuffle_last_multihead_matmul(Output: " + output_name + ")")
-                .c_str());
-
-        // return
-        layer = reshape_after_mha_layer;
-      } else {
-        PADDLE_ENFORCE_EQ(
-            input->getDimensions().nbDims,
-            3,
-            platform::errors::InvalidArgument(
-                "The Input dim of the MultiheadMatMul should be 3, "
-                "but it's (%d) now.",
-                input->getDimensions().nbDims));
-        // transpose weight_data from m * n to n * m
-        auto* input_bias_qk =
-            engine_->GetITensor(op_desc.Input("BiasQK").front());
-
-        TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
-                                      static_cast<void*>(weight_data),
-                                      static_cast<size_t>(weight_t->numel())};
-        weight.dims.assign({n, m});
-
-        TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
-                                    static_cast<void*>(bias_data),
-                                    static_cast<size_t>(bias_t->numel())};
-
-        // add shuffle before fc
-        std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
-        nvinfer1::ITensor* input_shape_tensor = Shape(input);
-
-        for (int i = 0; i < 5; i++) {
-          reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
-        }
-        for (int i = 0; i < 3; i++) {
-          reshape_before_fc_shape_tensor[i] =
-              GetEleTensorOfShape(input_shape_tensor, i);
-        }
-        auto* reshape_before_fc_layer =
-            TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
-        if (op_desc.HasAttr("Input_scale")) {
-          engine_->SetTensorDynamicRange(reshape_before_fc_layer->getOutput(0),
-                                         in_scale);
-        }
-        reshape_before_fc_layer->setInput(
-            1, *Concat(reshape_before_fc_shape_tensor));
-        reshape_before_fc_layer->setName(
-            ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
-                .c_str());
-
-        // add layer fc
-        nvinfer1::ILayer* fc_layer = nullptr;
-        if (op_desc.HasAttr("Input_scale")) {
-          nvinfer1::DimsHW nv_ksize(1, 1);
-          fc_layer =
-              TRT_ENGINE_ADD_LAYER(engine_,
-                                   Convolution,
-                                   *reshape_before_fc_layer->getOutput(0),
-                                   n,
-                                   nv_ksize,
-                                   weight.get(),
-                                   bias.get());
-        } else {
-          fc_layer =
-              TRT_ENGINE_ADD_LAYER(engine_,
-                                   FullyConnected,
-                                   *reshape_before_fc_layer->getOutput(0),
-                                   n,
-                                   weight.get(),
-                                   bias.get());
-        }
-
-        if (op_desc.HasAttr("fc_out_threshold")) {
-          PADDLE_ENFORCE_EQ(
-              op_desc.HasAttr("fc_out_threshold"),
-              true,
-              platform::errors::InvalidArgument(
-                  "must have out threshold in multihead layers in int8 mode"));
-          float out_scale =
-              PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
-          engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
-        }
-        fc_layer->setName(
-            ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
-
-        // no need to add shuffle after fc, just change it in
-        // QkvToContextPluginDynamic
-
-        // add qkv to context
-        int head_size = hidden_out / head_number;
-        float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
-
-        std::vector<nvinfer1::ITensor*> plugin_inputs;
-        plugin_inputs.push_back(fc_layer->getOutput(0));
-        plugin_inputs.push_back(input_bias_qk);
-        bool with_fp16 =
-            engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
-
-        if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
-          with_fp16 = true;
-        }
-        plugin::DynamicPluginTensorRT* plugin =
-            new plugin::QkvToContextPluginDynamic(
-                hidden_in, head_number, head_size, scale, with_fp16);
-        layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
-      }
+      } else {
+        if (input_dims.d[1] <= 384 && !bias_qk_attr &&
+            engine_->precision() != AnalysisConfig::Precision::kFloat32) {
+          /*
+           * input_dims.d[0]: batch(-1)
+           * input_dims.d[1]: length:256
+           * input_dims.d[2]: hidden_size:768
+           input
+             |[b,256,768]
+             |
+          shuffle                weight    bias
+             |[b,256,768,1,1]      |         |
+             |_____________________|_________|
+             |
+             fc
+             |[b,256,2304,1,1]
+             |
+          shuffle               mask(fake)    pos  max_length
+             |[b*256,2304,1,1]       |         |        |
+             |                       |         |        |
+             |_______________________|_________|________|
+             |
+            MHA
+             |[b*256,768]
+             |
+          shuffle
+             |[b, 256, 768]
+             |
+            out
+          */
+
+          nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT,
+                                   static_cast<void*>(weight_data),
+                                   static_cast<int32_t>(weight_t->numel())};
+          nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT,
+                                 static_cast<void*>(bias_data),
+                                 static_cast<int32_t>(bias_t->numel())};
+
+          /*** transpose the weight and bias ***/
+          int head_size = hidden_out / head_number;
+          // [3, head_number, head_size, hidden_in] -> [head_number, 3,
+          // head_size, hidden_in]
+          auto transpose_weight_v2 = [](const float* src,
+                                        float* dst,
+                                        int three,
+                                        int head_number,
+                                        int head_size,
+                                        int hidden_in) {
+            const int HH = head_size * hidden_in;
+            for (int i = 0; i < three; ++i) {
+              for (int n = 0; n < head_number; ++n) {
+                for (int hh = 0; hh < HH; ++hh) {
+                  dst[n * three * HH + i * HH + hh] =
+                      src[i * head_number * HH + n * HH + hh];
+                }
+              }
+            }
+          };
+          // [3, head_number, head_size] -> [head_number, 3, head_size]
+          auto transpose_bias_v2 =
+              [](const float* src, float* dst, int N, int H) {
+                for (int i = 0; i < 3; ++i) {
+                  for (int n = 0; n < N; ++n) {
+                    for (int h = 0; h < H; ++h) {
+                      dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h];
+                    }
+                  }
+                }
+              };
+          memcpy(weight_data_tmp.data(),
+                 weight_data,
+                 weight_t->numel() * sizeof(float));
+          transpose_weight_v2(weight_data_tmp.data(),
+                              weight_data,
+                              three,
+                              head_number,
+                              head_size,
+                              hidden_in);
+
+          std::vector<float> bias_data_tmp;
+          bias_data_tmp.reserve(bias_t->numel());
+          memcpy(
+              bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float));
+          transpose_bias_v2(
+              bias_data_tmp.data(), bias_data, head_number, head_size);
+
+          // add shuffle for FullyConnected layer
+          std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
+          nvinfer1::ITensor* input_shape_tensor = Shape(input);
+          for (int i = 0; i < 5; i++) {
+            reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
+          }
+          for (int i = 0; i < 3; i++) {
+            reshape_before_fc_shape_tensor[i] =
+                GetEleTensorOfShape(input_shape_tensor, i);
+          }
+          auto* reshape_before_fc_layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+          reshape_before_fc_layer->setInput(
+              1, *Concat(reshape_before_fc_shape_tensor));
+          reshape_before_fc_layer->setName(
+              ("shuffle_before_fc_multihead_matmul(Output: " + output_name +
+               ")")
+                  .c_str());
+
+          // add fc layer
+          nvinfer1::ILayer* fc_layer = nullptr;
+          fc_layer = TRT_ENGINE_ADD_LAYER(engine_,
+                                          FullyConnected,
+                                          *reshape_before_fc_layer->getOutput(0),
+                                          n,
+                                          weight,
+                                          bias);
+
+          // add shuffle for CustomQKVToContextPluginDynamic layer
+          auto* reshape_after_fc_layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0));
+          std::vector<nvinfer1::ITensor*> mha_input_tensor_shape;
+          mha_input_tensor_shape.push_back(Add1DConstantLayer(-1));
+          mha_input_tensor_shape.push_back(
+              Add1DConstantLayer(hidden_out * 3));  // Q,K,V
+          mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
+          mha_input_tensor_shape.push_back(Add1DConstantLayer(1));
+          reshape_after_fc_layer->setInput(1, *Concat(mha_input_tensor_shape));
+          reshape_after_fc_layer->setName(
+              ("shuffle_after_fc_multihead_matmul(Output: " + output_name + ")")
+                  .c_str());
+
+          // add mha_plugin
+          auto creator = GetPluginRegistry()->getPluginCreator(
+              "CustomQKVToContextPluginDynamic", "2");
+          assert(creator != nullptr);
+          // set the attributes of mha_plugin
+          int type = static_cast<int>(nvinfer1::DataType::kHALF);
+          int var_seqlen = 1;
+          bool has_mask = true;
+          std::vector<nvinfer1::PluginField> fields{
+              {"hidden_size",
+               &hidden_out,
+               nvinfer1::PluginFieldType::kINT32,
+               1},
+              {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1},
+              {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1},
+              {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1},
+              {"var_seqlen",
+               &var_seqlen,
+               nvinfer1::PluginFieldType::kINT32,
+               1}};
+          nvinfer1::PluginFieldCollection* plugin_collection =
+              static_cast<nvinfer1::PluginFieldCollection*>(malloc(
+                  sizeof(*plugin_collection) +
+                  fields.size() *
+                      sizeof(nvinfer1::PluginField)));  // remember to free
+          plugin_collection->nbFields = static_cast<int>(fields.size());
+          plugin_collection->fields = fields.data();
+          auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic",
+                                              plugin_collection);
+          free(plugin_collection);
+          // set inputs
+          std::vector<nvinfer1::ITensor*> plugin_inputs;
+          // input_0 for plugin
+          plugin_inputs.emplace_back(reshape_after_fc_layer->getOutput(0));
+          // input_1(fake) for plugin
+          std::vector<int> mask = {1};
+          nvinfer1::ITensor* mask_tensor = Add1DConstantLayer(mask);
+          plugin_inputs.emplace_back(mask_tensor);
+          // input_2 for plugin
+          std::vector<int> pos_id = {0};
+          int max_batch = 500;
+          for (int i = 1; i < max_batch; i++) {
+            pos_id.push_back(i);
+          }
+          nvinfer1::ITensor* fake_pos_id_tensor = Add1DConstantLayer(pos_id);
+          nvinfer1::ITensor* length_tensor =
+              GetEleTensorOfShape(input_shape_tensor, 1);
+          auto pos_id_layer =
+              TRT_ENGINE_ADD_LAYER(engine_,
+                                   ElementWise,
+                                   *fake_pos_id_tensor,
+                                   *length_tensor,
+                                   nvinfer1::ElementWiseOperation::kPROD);
+          // size = batch + 1;
+          nvinfer1::ITensor* batch_tensor =
+              GetEleTensorOfShape(input_shape_tensor, 0);
+          std::vector<int> const_data = {1};
+          nvinfer1::ITensor* const_tensor = Add1DConstantLayer(const_data);
+          auto size_layer =
+              TRT_ENGINE_ADD_LAYER(engine_,
+                                   ElementWise,
+                                   *batch_tensor,
+                                   *const_tensor,
+                                   nvinfer1::ElementWiseOperation::kSUM);
+          // get size(batch + 1) data from pos_id_tensor
+          nvinfer1::Dims start;
+          nvinfer1::Dims stride;
+          nvinfer1::Dims size;
+
+          start.nbDims = 1;
+          stride.nbDims = 1;
+          size.nbDims = 1;
+
+          start.d[0] = 0;
+          stride.d[0] = 1;
+          size.d[0] = 1;
+
+          auto* slice_pos_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Slice, *pos_id_layer->getOutput(0), start, size, stride);
+          slice_pos_layer->setInput(2, *size_layer->getOutput(0));
+          plugin_inputs.emplace_back(slice_pos_layer->getOutput(0));
+
+          // input_3 for plugin
+          std::vector<int> data(500, 1);
+          nvinfer1::ITensor* fake_max_seqlen_tensor = Add1DConstantLayer(data);
+          auto* slice_max_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Slice, *fake_max_seqlen_tensor, start, size, stride);
+          slice_max_layer->setInput(2, *length_tensor);
+          plugin_inputs.emplace_back(slice_max_layer->getOutput(0));
+          // plugin_layer
+          auto plugin_layer = engine_->network()->addPluginV2(
+              plugin_inputs.data(), plugin_inputs.size(), *plugin);
+
+          // add shuffle
+          auto* reshape_after_mha_layer = TRT_ENGINE_ADD_LAYER(
+              engine_, Shuffle, *plugin_layer->getOutput(0));
+          std::vector<nvinfer1::ITensor*> reshape_tensor;
+          reshape_tensor.push_back(batch_tensor);
+          reshape_tensor.push_back(length_tensor);
+          reshape_tensor.push_back(Add1DConstantLayer(-1));
+          reshape_after_mha_layer->setInput(1, *Concat(reshape_tensor));
+          reshape_after_mha_layer->setName(
+              ("shuffle_last_multihead_matmul(Output: " + output_name + ")")
+                  .c_str());
+
+          // return
+          layer = reshape_after_mha_layer;
+        } else {
+          PADDLE_ENFORCE_EQ(
+              input->getDimensions().nbDims,
+              3,
+              platform::errors::InvalidArgument(
+                  "The Input dim of the MultiheadMatMul should be 3, "
+                  "but it's (%d) now.",
+                  input->getDimensions().nbDims));
+          // transpose weight_data from m * n to n * m
+          auto* input_bias_qk =
+              engine_->GetITensor(op_desc.Input("BiasQK").front());
+
+          TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
+                                        static_cast<void*>(weight_data),
+                                        static_cast<size_t>(weight_t->numel())};
+          weight.dims.assign({n, m});
+
+          TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT,
+                                      static_cast<void*>(bias_data),
+                                      static_cast<size_t>(bias_t->numel())};
+
+          // add shuffle before fc
+          std::vector<nvinfer1::ITensor*> reshape_before_fc_shape_tensor;
+          nvinfer1::ITensor* input_shape_tensor = Shape(input);
+
+          for (int i = 0; i < 5; i++) {
+            reshape_before_fc_shape_tensor.push_back(Add1DConstantLayer(1));
+          }
+          for (int i = 0; i < 3; i++) {
+            reshape_before_fc_shape_tensor[i] =
+                GetEleTensorOfShape(input_shape_tensor, i);
+          }
+          auto* reshape_before_fc_layer =
+              TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input);
+          if (op_desc.HasAttr("Input_scale")) {
+            engine_->SetTensorDynamicRange(
+                reshape_before_fc_layer->getOutput(0), in_scale);
+          }
+          reshape_before_fc_layer->setInput(
+              1, *Concat(reshape_before_fc_shape_tensor));
+          reshape_before_fc_layer->setName(
+              ("shuffle_before_multihead_mamul(Output: " + output_name + ")")
+                  .c_str());
+
+          // add layer fc
+          nvinfer1::ILayer* fc_layer = nullptr;
+          if (op_desc.HasAttr("Input_scale")) {
+            nvinfer1::DimsHW nv_ksize(1, 1);
+            fc_layer =
+                TRT_ENGINE_ADD_LAYER(engine_,
+                                     Convolution,
+                                     *reshape_before_fc_layer->getOutput(0),
+                                     n,
+                                     nv_ksize,
+                                     weight.get(),
+                                     bias.get());
+          } else {
+            fc_layer =
+                TRT_ENGINE_ADD_LAYER(engine_,
+                                     FullyConnected,
+                                     *reshape_before_fc_layer->getOutput(0),
+                                     n,
+                                     weight.get(),
+                                     bias.get());
+          }
+
+          if (op_desc.HasAttr("fc_out_threshold")) {
+            PADDLE_ENFORCE_EQ(op_desc.HasAttr("fc_out_threshold"),
+                              true,
+                              platform::errors::InvalidArgument(
+                                  "must have out threshold in multihead layers "
+                                  "in int8 mode"));
+            float out_scale =
+                PADDLE_GET_CONST(float, op_desc.GetAttr("fc_out_threshold"));
+            engine_->SetTensorDynamicRange(fc_layer->getOutput(0), out_scale);
+          }
+          fc_layer->setName(
+              ("multihead_mamul_fc(Output: " + output_name + ")").c_str());
+
+          // no need to add shuffle after fc, just change it in
+          // QkvToContextPluginDynamic
+
+          // add qkv to context
+          int head_size = hidden_out / head_number;
+          float scale = PADDLE_GET_CONST(float, op_desc.GetAttr("alpha"));
+
+          std::vector<nvinfer1::ITensor*> plugin_inputs;
+          plugin_inputs.push_back(fc_layer->getOutput(0));
+          plugin_inputs.push_back(input_bias_qk);
+          bool with_fp16 =
+              engine_->WithFp16() && !engine_->disable_trt_plugin_fp16();
+
+          if (engine_->precision() == AnalysisConfig::Precision::kInt8) {
+            with_fp16 = true;
+          }
+          plugin::DynamicPluginTensorRT* plugin =
+              new plugin::QkvToContextPluginDynamic(
+                  hidden_in, head_number, head_size, scale, with_fp16);
+          layer = engine_->AddDynamicPlugin(plugin_inputs.data(), 2, plugin);
+        }
+      }
     } else {
       PADDLE_THROW(platform::errors::Fatal(