diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 88384b39b0c27a54a80f12765f84dd751267557d..4ab5d5d5fccab58600a42f40366a2be1fe6ff56d 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -40,6 +40,9 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { input_ids.push_back(engine_->GetITensor(id_names[i])); } + // input_embs[0]: word_embedding + // input_embs[1]: pos_embedding + // input_embs[2]: sent_embedding std::vector input_embs; std::vector emb_sizes; @@ -54,7 +57,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { return temp_data; }; - int hidden = 0; + // int hidden = 0; for (int i = 0; i < input_num; i++) { framework::DDim emb_dims; float* emb_data = get_persistable_data(emb_names[i], &emb_dims); @@ -65,7 +68,7 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { emb_dims.size(), 2, platform::errors::InvalidArgument( "The fused EmbEltwiseLayerNorm's emb should be 2 dims.")); - hidden = emb_dims[1]; + // hidden = emb_dims[1]; } framework::DDim bias_dims, scale_dims; @@ -76,21 +79,66 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); nvinfer1::ILayer* layer = nullptr; + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 
1 : 0); + if (engine_->with_dynamic_shape()) { +#ifdef USE_NVINFER_PLUGIN + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(engine_->GetITensor( + "eval_placeholder_0")); // word_embedding, eval_placeholder_0 + plugin_inputs.emplace_back(engine_->GetITensor( + "eval_placeholder_1")); // sent_embedding, eval_placeholder_1 + plugin_inputs.emplace_back(engine_->GetITensor( + "eval_placeholder_2")); // cu_seqlens, eval_placeholder_2 + plugin_inputs.emplace_back(engine_->GetITensor( + "eval_placeholder_3")); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); +#else + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin 
= new plugin::EmbEltwiseLayernormPluginDynamic( + plugin = new plugin::EmbLaynormPluginDynamicV2( input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, eps); - auto plugin_layer = - engine_->AddPluginV2(input_ids.data(), input_num, plugin); - nvinfer1::Permutation permutation{1, 0, 2, 3, 4}; - auto trans_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *plugin_layer->getOutput(0)); - trans_layer->setFirstTranspose(permutation); - layer = trans_layer; + layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); +#endif } else { PADDLE_THROW(platform::errors::Fatal( "You are running the Ernie(Bert) model in static" @@ -100,7 +148,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { } auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, test_mode); #else PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc index 69192fbe13a2476a7203f0b7e868a99081b4af6b..b14ff7a96d21d647d5593ee3f8a76ee496c09529 100644 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc @@ -26,33 +26,38 @@ class MulOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; - - framework::OpDesc op_desc(op, nullptr); - // Declare inputs - auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); - - bool transpose_x = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X")); - bool transpose_y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); - -#ifdef USE_NVINFER_PLUGIN - nvinfer1::DataType type = 
(engine_->WithFp16() == 1) - ? nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT; - plugin::ConvertMaskPluginDynamic* plugin = - new plugin::ConvertMaskPluginDynamic(type); - auto convert_mask_layer = engine_->AddPluginV2(&input1, 1, plugin); - engine_->SetITensor("qkv_plugin_mask", convert_mask_layer->getOutput(0)); -#endif - - // Both the input1 and input2 do not need transpose. - auto* layer = TRT_ENGINE_ADD_LAYER( - engine_, MatrixMultiply, *const_cast(input1), - transpose_x, *const_cast(input2), transpose_y); - - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "matmul", {output_name}, test_mode); + /* + VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + + bool transpose_x = BOOST_GET_CONST(bool, + op_desc.GetAttr("transpose_X")); + bool transpose_y = BOOST_GET_CONST(bool, + op_desc.GetAttr("transpose_Y")); + + #ifdef USE_NVINFER_PLUGIN + nvinfer1::DataType type = (engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + plugin::ConvertMaskPluginDynamic* plugin = + new plugin::ConvertMaskPluginDynamic(type); + auto convert_mask_layer = engine_->AddPluginV2(&input1, 1, plugin); + engine_->SetITensor("qkv_plugin_mask", + convert_mask_layer->getOutput(0)); + #endif + + // Both the input1 and input2 do not need transpose. 
+ auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, MatrixMultiply, *const_cast(input1), + transpose_x, *const_cast(input2), transpose_y); + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "matmul", {output_name}, test_mode); + */ } }; diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index d30f0ff1293f2af72d6c5320993089b51f21adfd..4bce6eac8c33f04293ca0b5362bee21442862ba2 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -119,17 +119,19 @@ class MultiheadMatMulOpConverter : public OpConverter { auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); auto creator = GetPluginRegistry()->getPluginCreator( - "CustomQKVToContextPluginDynamic", "1"); + "CustomQKVToContextPluginDynamic", "2"); assert(creator != nullptr); int type = static_cast((engine_->WithFp16() == 1) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT); bool has_mask = true; + int var_seqlen = 1; const std::vector fields{ {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, {"hidden_size", &hidden, nvinfer1::PluginFieldType::kINT32, 1}, {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, + {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, }; nvinfer1::PluginFieldCollection* plugin_collection = static_cast( @@ -144,8 +146,12 @@ class MultiheadMatMulOpConverter : public OpConverter { free(plugin_collection); std::vector plugin_inputs; - plugin_inputs.push_back(fc_layer->getOutput(0)); - plugin_inputs.push_back(mask_tensor); + plugin_inputs.emplace_back(fc_layer->getOutput(0)); + plugin_inputs.emplace_back(mask_tensor); + plugin_inputs.emplace_back(engine_->GetITensor( + "eval_placeholder_2")); // cu_seqlens, eval_placeholder_2 + plugin_inputs.emplace_back(engine_->GetITensor( + 
"eval_placeholder_3")); // max_seqlen, eval_placeholder_3 auto plugin_layer = engine_->network()->addPluginV2( plugin_inputs.data(), plugin_inputs.size(), *plugin); layer = plugin_layer; diff --git a/paddle/fluid/inference/tensorrt/convert/scale_op.cc b/paddle/fluid/inference/tensorrt/convert/scale_op.cc index f9a1fe41ddc046aad8cc3a5397453b0f68c1a112..54e8106d8eb6ec2e99e2dc06ec24f41890d2f7b2 100644 --- a/paddle/fluid/inference/tensorrt/convert/scale_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/scale_op.cc @@ -25,85 +25,94 @@ class ScaleOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(3) << "convert a fluid scale op to tensorrt mul layer without bias"; - - framework::OpDesc op_desc(op, nullptr); - // Declare inputs - std::vector itensors; - std::string input_name = op_desc.Input("X").front(); - std::string out_name = op_desc.Output("Out").front(); - - auto input = engine_->GetITensor(input_name); - bool bias_after_scale = - BOOST_GET_CONST(bool, op_desc.GetAttr("bias_after_scale")); - float bias = BOOST_GET_CONST(float, op_desc.GetAttr("bias")); - float scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale")); - auto create_weights = [&](float data, std::string type) -> float* { - std::unique_ptr tmp_tensor(new framework::Tensor()); - tmp_tensor->Resize({1}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); - tmp_data[0] = data; - engine_->SetWeights(out_name + "_scale_op_" + type, - std::move(tmp_tensor)); - return tmp_data; - }; - - float* bias_ptr = create_weights(bias, "bias"); - float* scale_ptr = create_weights(scale, "scale"); - - TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, - static_cast(scale_ptr), 1}; - TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, - static_cast(bias_ptr), 1}; - TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, nullptr, - 0}; - nvinfer1::ILayer* layer = 
nullptr; - - auto input_dim = input->getDimensions(); - PADDLE_ENFORCE_GE(input_dim.nbDims, 3, - platform::errors::Fatal( - "Paddle-TRT scale mode only support dimension >= 3")); - - nvinfer1::IShuffleLayer* expand_layer = nullptr; - nvinfer1::IShuffleLayer* squeeze_layer = nullptr; - - if (input_dim.nbDims == 3) { - // TensorRT scale layer is not supporting input dims < 4 when using - // explicit batch - expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - nvinfer1::Dims4 target_shape(0, 0, 0, 1); // expand 1 dims - expand_layer->setReshapeDimensions(target_shape); - input = expand_layer->getOutput(0); - } - - if (bias_after_scale) { - layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, - shift_weights.get(), scale_weights.get(), power_weights.get()); - } else { - // add bias - layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM, - shift_weights.get(), power_weights.get(), power_weights.get()); - // mul scale - layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *(layer->getOutput(0)), nvinfer1::ScaleMode::kUNIFORM, - power_weights.get(), scale_weights.get(), power_weights.get()); - } - - PADDLE_ENFORCE_EQ(layer != nullptr, true, - platform::errors::Fatal("Create scale layer failed.")); - - if (input_dim.nbDims == 3) { - // TensorRT scale layer is not supporting input dims < 4 when using - // explicit batch - squeeze_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); - nvinfer1::Dims3 target_shape(0, 0, 0); // expand 1 dims - squeeze_layer->setReshapeDimensions(target_shape); - layer = static_cast(squeeze_layer); - } - RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); + /* + VLOG(3) << "convert a fluid scale op to tensorrt mul layer without + bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + std::vector itensors; + std::string input_name = op_desc.Input("X").front(); + std::string out_name = op_desc.Output("Out").front(); + + 
auto input = engine_->GetITensor(input_name); + bool bias_after_scale = + BOOST_GET_CONST(bool, op_desc.GetAttr("bias_after_scale")); + float bias = BOOST_GET_CONST(float, op_desc.GetAttr("bias")); + float scale = BOOST_GET_CONST(float, op_desc.GetAttr("scale")); + auto create_weights = [&](float data, std::string type) -> float* { + std::unique_ptr tmp_tensor(new + framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = + tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(out_name + "_scale_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + + float* bias_ptr = create_weights(bias, "bias"); + float* scale_ptr = create_weights(scale, "scale"); + + TensorRTEngine::Weight scale_weights{nvinfer1::DataType::kFLOAT, + static_cast(scale_ptr), 1}; + TensorRTEngine::Weight shift_weights{nvinfer1::DataType::kFLOAT, + static_cast(bias_ptr), 1}; + TensorRTEngine::Weight power_weights{nvinfer1::DataType::kFLOAT, + nullptr, + 0}; + nvinfer1::ILayer* layer = nullptr; + + auto input_dim = input->getDimensions(); + PADDLE_ENFORCE_GE(input_dim.nbDims, 3, + platform::errors::Fatal( + "Paddle-TRT scale mode only support dimension >= + 3")); + + nvinfer1::IShuffleLayer* expand_layer = nullptr; + nvinfer1::IShuffleLayer* squeeze_layer = nullptr; + + if (input_dim.nbDims == 3) { + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + expand_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + nvinfer1::Dims4 target_shape(0, 0, 0, 1); // expand 1 dims + expand_layer->setReshapeDimensions(target_shape); + input = expand_layer->getOutput(0); + } + + if (bias_after_scale) { + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *input, nvinfer1::ScaleMode::kUNIFORM, + shift_weights.get(), scale_weights.get(), power_weights.get()); + } else { + // add bias + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *(input), nvinfer1::ScaleMode::kUNIFORM, + shift_weights.get(), power_weights.get(), 
power_weights.get()); + // mul scale + layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *(layer->getOutput(0)), + nvinfer1::ScaleMode::kUNIFORM, + power_weights.get(), scale_weights.get(), power_weights.get()); + } + + PADDLE_ENFORCE_EQ(layer != nullptr, true, + platform::errors::Fatal("Create scale layer + failed.")); + + if (input_dim.nbDims == 3) { + // TensorRT scale layer is not supporting input dims < 4 when using + // explicit batch + squeeze_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *(layer->getOutput(0))); + nvinfer1::Dims3 target_shape(0, 0, 0); // expand 1 dims + squeeze_layer->setReshapeDimensions(target_shape); + layer = static_cast(squeeze_layer); + } + RreplenishLayerAndOutput(layer, "scale", {out_name}, test_mode); + */ } }; diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 20b471a4c2af4edbff04368f4609ac4865ae1513..0dc7bc61dcb350c601215000b74ad79beb168a7c 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -54,7 +54,7 @@ class SkipLayerNormOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { #ifdef USE_NVINFER_PLUGIN auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "1"); + "CustomSkipLayerNormPluginDynamic", "2"); assert(creator != nullptr); int type = static_cast((engine_->WithFp16() == 1) ? nvinfer1::DataType::kHALF diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index a8784ba51389b13f5c8f46df0922efd3a788c8d8..316ac8e353f12b8e39890c47234cc341bc1baf35 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -13,7 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" +// #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" namespace paddle { namespace inference { @@ -30,25 +31,30 @@ class SliceOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); - - std::vector axes = - BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); - std::vector starts = - BOOST_GET_CONST(std::vector, op_desc.GetAttr("starts")); - std::vector ends = - BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); + /* + std::vector axes = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); + */ nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - nvinfer1::Permutation permutation{1, 0, 2, 3, 4}; - auto trans_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); - trans_layer->setFirstTranspose(permutation); + /* + nvinfer1::Permutation permutation{1, 0, 2, 3, 4}; + auto trans_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); + trans_layer->setFirstTranspose(permutation); + */ std::vector plugin_inputs; - plugin_inputs.emplace_back(trans_layer->getOutput(0)); + // plugin_inputs.emplace_back(trans_layer->getOutput(0)); + plugin_inputs.emplace_back(input); + plugin_inputs.emplace_back(engine_->GetITensor("eval_placeholder_2")); - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); - plugin::SlicePluginDynamic* plugin = - new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); + // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::SpecialSlicePluginDynamic* plugin = + new plugin::SpecialSlicePluginDynamic(); layer = 
engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), plugin); } else { diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index f35024529c61a253f314e5eca985713227d3f343..15cbcc80180e2224413fb77ff4e5d1f351979d09 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -26,45 +26,48 @@ class StackOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert fluid stack op to tensorrt stack layer"; + /* + VLOG(4) << "convert fluid stack op to tensorrt stack layer"; - framework::OpDesc op_desc(op, nullptr); - auto input = op_desc.Input("X"); - int input_num = input.size(); - nvinfer1::ITensor** inputs = - (nvinfer1::ITensor**)malloc(input_num * sizeof(nvinfer1::ITensor*)); + framework::OpDesc op_desc(op, nullptr); + auto input = op_desc.Input("X"); + int input_num = input.size(); + nvinfer1::ITensor** inputs = + (nvinfer1::ITensor**)malloc(input_num * sizeof(nvinfer1::ITensor*)); - for (int i = 0; i < input_num; ++i) { - inputs[i] = engine_->GetITensor(input[i]); - } + for (int i = 0; i < input_num; ++i) { + inputs[i] = engine_->GetITensor(input[i]); + } - int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - if (axis < 0) { - axis = axis + inputs[0]->getDimensions().nbDims + 1; - } + int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); + if (axis < 0) { + axis = axis + inputs[0]->getDimensions().nbDims + 1; + } - nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { -#if IS_TRT_VERSION_GE(6000) - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num); - layer = engine_->AddPluginV2(inputs, input_num, plugin); - assert(layer != nullptr); -#else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that 
" - "your TRT version is no less than 6.0")); -#endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } - auto output_name = op_desc.Output("Y").front(); - RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); - free(inputs); + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + #if IS_TRT_VERSION_GE(6000) + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num); + layer = engine_->AddPluginV2(inputs, input_num, plugin); + assert(layer != nullptr); + #else + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that + " + "your TRT version is no less than 6.0")); + #endif + } else { + PADDLE_THROW(platform::errors::Fatal( + "You are running the Ernie(Bert) model in static" + "shape mode, which is not supported for the time being.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" + " to set the shape information to run the dynamic shape mode.")); + } + auto output_name = op_desc.Output("Y").front(); + RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); + free(inputs); + */ } }; diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 22d28a44cb8bbb89c85bb94eb8825bcad05bdc26..22fed0b69763900e4f9efd8cbd61ef7d53d46ce0 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -60,9 +60,9 @@ TRT_DT FluidDataType2TRT(FluidDT type) { template nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, bool with_dynamic_shape = false) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, + PADDLE_ENFORCE_GT(shape.size(), 0UL, platform::errors::InvalidArgument( - "TensorRT's tensor input requires at least 2 " + "TensorRT's tensor input requires at least 1 " "dimensions, but input %s has %d dims.", input, shape.size())); PADDLE_ENFORCE_LE(shape.size(), 4UL, diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 2ad81405e5ed83e29a45e3cd076360d69c2691a4..32a769b678c2baafdfd7d665bdb7f931c6b8e865 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -2,7 +2,7 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu trt_plugin_factory.cc gelu_op_plugin.cu pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu - stack_op_plugin.cu convert_mask_plugin.cu + stack_op_plugin.cu convert_mask_plugin.cu special_slice_plugin.cu instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu hard_swish_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu new file mode 100644 index 0000000000000000000000000000000000000000..14ebc254bee09181e10bb6a68b77e65cca6c9f64 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -0,0 +1,174 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +SpecialSlicePluginDynamic::SpecialSlicePluginDynamic() {} + +SpecialSlicePluginDynamic::SpecialSlicePluginDynamic(void const* serial_data, + size_t serial_length) {} + +SpecialSlicePluginDynamic::~SpecialSlicePluginDynamic() {} + +nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const { + return new SpecialSlicePluginDynamic(); +} + +const char* SpecialSlicePluginDynamic::getPluginType() const { + return "special_slice_plugin"; +} + +int SpecialSlicePluginDynamic::getNbOutputs() const { return 1; } + +int SpecialSlicePluginDynamic::initialize() { return 0; } + +size_t SpecialSlicePluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + return serialize_size; +} + +void SpecialSlicePluginDynamic::serialize(void* buffer) const {} + 
+nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::DimsExprs output(inputs[0]); + auto one = expr_builder.constant(1); + output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, + *inputs[1].d[0], *one); + + return output; +} + +void SpecialSlicePluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t SpecialSlicePluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +void SpecialSlicePluginDynamic::destroy() { delete this; } + +void SpecialSlicePluginDynamic::terminate() {} + +bool SpecialSlicePluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* desc, int nb_inputs, + int nb_outputs) { + if (pos == 0) // slice tensor + return (desc[pos].type == + nvinfer1::DataType::kHALF); // || desc[pos].type == + // nvinfer1::DataType::kFLOAT); + + if (pos == 1) // cu_seqlen + return (desc[pos].type == nvinfer1::DataType::kINT32 && + desc[pos].format == nvinfer1::TensorFormat::kLINEAR); + + return (desc[pos].type == nvinfer1::DataType::kHALF); // || desc[pos].type == + // nvinfer1::DataType::kFLOAT); +} + +nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be equal to 0")); + return input_types[0]; +} + +template +__global__ void SpecialSliceKernel(const T* slice_input, + const int32_t* cu_seqlens, T* output) { + const int hidden = blockDim.x; + const int batch = blockIdx.x; + + output[batch * hidden + threadIdx.x] = + slice_input[cu_seqlens[batch] * hidden + threadIdx.x]; +} + +int 
SpecialSlicePluginDynamic::enqueue(
+    const nvinfer1::PluginTensorDesc* input_desc,
+    const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs,
+    void* const* outputs, void* workspace, cudaStream_t stream) {
+  auto input_dims = input_desc[0].dims;  // (sum(S), 768, 1, 1)
+  auto out_dims = output_desc[0].dims;   // (batch, 768, 1, 1)
+
+  assert(input_desc[0].type == nvinfer1::DataType::kHALF);
+
+  const int32_t hidden = input_dims.d[1];
+  const int num_blocks = out_dims.d[0];  // batch size
+  const int num_threads = hidden;
+
+  const half* slice_input = static_cast<const half*>(inputs[0]);
+  const int32_t* cu_seqlens = static_cast<const int32_t*>(inputs[1]);
+  half* output = static_cast<half*>(outputs[0]);
+
+  SpecialSliceKernel<<<num_blocks, num_threads, 0, stream>>>(
+      slice_input, cu_seqlens, output);
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+SpecialSlicePluginDynamicCreator::SpecialSlicePluginDynamicCreator() {}
+
+const char* SpecialSlicePluginDynamicCreator::getPluginName() const {
+  // Must match SpecialSlicePluginDynamic::getPluginType() so the TensorRT
+  // plugin registry can find this creator at deserialization time
+  // (was "stack_plugin", a copy-paste error from the stack plugin).
+  return "special_slice_plugin";
+}
+
+const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const {
+  return "1";
+}
+
+const nvinfer1::PluginFieldCollection*
+SpecialSlicePluginDynamicCreator::getFieldNames() {
+  return &field_collection_;
+}
+
+nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::createPlugin(
+    const char* name, const nvinfer1::PluginFieldCollection* fc) {
+  return new SpecialSlicePluginDynamic();
+}
+
+nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::deserializePlugin(
+    const char* name, const void* serial_data, size_t serial_length) {
+  auto plugin = new SpecialSlicePluginDynamic(serial_data, serial_length);
+  return plugin;
+}
+
+void SpecialSlicePluginDynamicCreator::setPluginNamespace(
+    const char* lib_namespace) {
+  plugin_namespace_ = lib_namespace;
+}
+
+const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const {
+  return plugin_namespace_.c_str();
+}
+
+#endif
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git 
a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h new file mode 100644 index 0000000000000000000000000000000000000000..438d9e9465c52a8c0288928c1aa3cb79d0371080 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class SpecialSlicePluginDynamic : public DynamicPluginTensorRT { + public: + SpecialSlicePluginDynamic(); + SpecialSlicePluginDynamic(void const* serial_data, size_t serial_length); + ~SpecialSlicePluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const 
nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + int axis_; + int num_stack_; +}; + +class SpecialSlicePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + SpecialSlicePluginDynamicCreator(); + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + private: + std::string plugin_namespace_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(SpecialSlicePluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 11b48e0ea91a4c1c5799afaba7367934e331d45c..012cae0ef5b442dce5e5ab0507f845b321eed567 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h 
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -264,9 +264,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
       buffers[bind_index] = static_cast<void *>(t.data<float>());
     } else if (type == framework::proto::VarType::INT64) {
       buffers[bind_index] = static_cast<void *>(t.data<int64_t>());
+    } else if (type == framework::proto::VarType::INT32) {
+      buffers[bind_index] = static_cast<void *>(t.data<int32_t>());
     } else {
       PADDLE_THROW(platform::errors::Fatal(
-          "The TRT Engine OP only support float and int64_t input."));
+          "The TRT Engine OP only support float/int32_t/int64_t input."));
     }
   }