From 974281142a739098a5a88efd1763497b54d5a2a7 Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Thu, 6 Aug 2020 09:10:47 +0800 Subject: [PATCH] [Cherry pick] Trt ernie serialization (#25956) * solve conflict * fix crash when trt not found in python; update unittest model path --- .../tensorrt/convert/emb_eltwise_layernorm.cc | 22 +-- paddle/fluid/inference/tensorrt/engine.h | 20 ++- paddle/fluid/inference/tensorrt/helper.h | 3 + .../plugin/emb_eltwise_layernorm_plugin.cu | 54 ++----- .../plugin/emb_eltwise_layernorm_plugin.h | 137 ++++++++++++++-- .../tensorrt/plugin/gelu_op_plugin.cu | 3 - .../tensorrt/plugin/gelu_op_plugin.h | 78 +++++++--- .../tensorrt/plugin/qkv_to_context_plugin.cu | 4 - .../tensorrt/plugin/qkv_to_context_plugin.h | 91 +++++++++-- .../plugin/skip_layernorm_op_plugin.cu | 8 +- .../plugin/skip_layernorm_op_plugin.h | 110 ++++++++++--- .../inference/tensorrt/plugin/trt_plugin.h | 22 ++- .../tensorrt/plugin/trt_plugin_utils.h | 6 + .../fluid/inference/tests/api/CMakeLists.txt | 19 +++ ...rt_dynamic_shape_ernie_deserialize_test.cc | 146 ++++++++++++++++++ .../tests/api/trt_dynamic_shape_ernie_test.cc | 2 +- .../operators/math/bert_encoder_functor.cu | 1 + paddle/fluid/platform/dynload/tensorrt.cc | 46 +++++- paddle/fluid/platform/dynload/tensorrt.h | 48 +++--- 19 files changed, 644 insertions(+), 176 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 5676c3facec..3797a561a49 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - auto use_fp16 = engine_->WithFp16(); plugin::DynamicPluginTensorRT* plugin = nullptr; - if (use_fp16) { -#ifdef SUPPORTS_CUDA_FP16 - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps); -#else - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps); -#endif - } else { - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps); - } + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps); layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); } else { PADDLE_THROW(platform::errors::Fatal( diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 03fd7a283b4..9ca0e979bf6 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -200,9 +200,23 @@ class TensorRTEngine { void Deserialize(const std::string& engine_serialized_data) { freshDeviceId(); infer_ptr runtime(createInferRuntime(&logger_)); - infer_engine_.reset(runtime->deserializeCudaEngine( - engine_serialized_data.c_str(), engine_serialized_data.size(), - &inference::Singleton::Global())); + if (with_dynamic_shape_) { +#if IS_TRT_VERSION_GE(6000) + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + 
nullptr)); +#else + + PADDLE_THROW(platform::errors::PreconditionNotMet( + "To enable dynamic shape support, the TensorRT version should be " + "greater than 6.0.0")); + +#endif + } else { + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + &inference::Singleton::Global())); + } PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed when deserialize engine info.!"); } diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 037dabf5d58..55a57caf9a0 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { return static_cast( dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION)); } +static nvinfer1::IPluginRegistry* getPluginRegistry() { + return static_cast(dy::getPluginRegistry()); +} // A logger for create TensorRT infer builder. 
class NaiveLogger : public nvinfer1::ILogger { diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu index 575dfa68e6e..e7f9381e971 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.cu @@ -33,53 +33,29 @@ namespace plugin { template int EmbEltwiseLayernormPluginDynamic::initialize() { - int nb_emb = embs_.size(); - std::vector ptr_vector(nb_emb); - std::vector> emb_fp16(nb_emb); - - if (sizeof(T) == sizeof(float)) { - // FP32 - for (int i = 0; i < nb_emb; ++i) { - ptr_vector[i] = embs_[i]; - } - } else { - // FP16 - for (int i = 0; i < nb_emb; ++i) { - auto emb_size = emb_sizes_[i]; - auto &tmp = emb_fp16[i]; - tmp.resize(emb_size); - - for (int j = 0; j < emb_size; ++j) { - tmp[j] = static_cast(embs_[i][j]); - } - ptr_vector[i] = tmp.data(); - } - } embs_gpu_.resize(embs_.size()); for (int i = 0; i < embs_.size(); i++) { - cudaMalloc(&embs_gpu_[i], sizeof(T) * emb_sizes_[i]); - cudaMemcpy(embs_gpu_[i], ptr_vector[i], emb_sizes_[i] * sizeof(T), - cudaMemcpyHostToDevice); + if (embs_[i]) { + cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); + cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float), + cudaMemcpyHostToDevice); + } } - cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); - cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float), - cudaMemcpyHostToDevice); - cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); - cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float), - cudaMemcpyHostToDevice); - - return 0; -} + if (bias_) { + cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); + cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float), + cudaMemcpyHostToDevice); + } + if (scale_) { + cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); + cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float), + cudaMemcpyHostToDevice); + 
} -template -size_t EmbEltwiseLayernormPluginDynamic::getSerializationSize() const { return 0; } -template -void EmbEltwiseLayernormPluginDynamic::serialize(void *buffer) const {} - template nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h index d0b5a4a5d6a..8ac611cd7c6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/emb_eltwise_layernorm_plugin.h @@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { hidden_size_(hidden_size), eps_(eps) {} - EmbEltwiseLayernormPluginDynamic(void const* serialData, - size_t serialLength) {} + EmbEltwiseLayernormPluginDynamic(void const* serial_data, + size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &emb_sizes_); + + embs_gpu_.resize(emb_sizes_.size()); + embs_.resize(emb_sizes_.size()); + for (size_t i = 0; i < emb_sizes_.size(); i++) { + cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]); + cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float), + cudaMemcpyHostToDevice); + reinterpret_cast(serial_data) += + emb_sizes_[i] * sizeof(float); + serial_length -= emb_sizes_[i] * sizeof(float); + embs_[i] = nullptr; + } + DeserializeValue(&serial_data, &serial_length, &bias_size_); + DeserializeValue(&serial_data, &serial_length, &scale_size_); + + cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); + cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float), + cudaMemcpyHostToDevice); + bias_ = nullptr; + reinterpret_cast(serial_data) += bias_size_ * sizeof(float); + serial_length -= bias_size_ * sizeof(float); + + cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); + cudaMemcpy(scale_gpu_, serial_data, scale_size_ * 
sizeof(float), + cudaMemcpyHostToDevice); + scale_ = nullptr; + reinterpret_cast(serial_data) += scale_size_ * sizeof(float); + serial_length -= scale_size_ * sizeof(float); + + DeserializeValue(&serial_data, &serial_length, &hidden_size_); + DeserializeValue(&serial_data, &serial_length, &eps_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { return new EmbEltwiseLayernormPluginDynamic( embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, @@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + int sum_num = 0; + sum_num += SerializedSize(emb_sizes_); + + for (size_t i = 0; i < emb_sizes_.size(); i++) { + sum_num += emb_sizes_[i] * sizeof(float); + } + + sum_num += SerializedSize(bias_size_); + sum_num += SerializedSize(scale_size_); + + sum_num += (bias_size_ + scale_size_) * sizeof(float); + sum_num += SerializedSize(hidden_size_); + sum_num += SerializedSize(eps_); + // sum_num += SerializedSize(with_fp16_); + + return sum_num; + } + + void serialize(void* buffer) const override { + // SerializeValue(&buffer, with_fp16_); + SerializeValue(&buffer, emb_sizes_); + for (size_t i = 0; i < emb_sizes_.size(); i++) { + SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]); + } + SerializeValue(&buffer, bias_size_); + SerializeValue(&buffer, scale_size_); + SerializeCudaPointer(&buffer, bias_gpu_, bias_size_); + SerializeCudaPointer(&buffer, scale_gpu_, scale_size_); + SerializeValue(&buffer, hidden_size_); + SerializeValue(&buffer, eps_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, 
- int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } @@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { // data on devices float* bias_gpu_; float* scale_gpu_; - std::vector embs_gpu_; + std::vector embs_gpu_; std::vector emb_sizes_; int bias_size_; @@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { int hidden_size_; float eps_; }; + +class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator { + public: + EmbEltwiseLayernormPluginV2Creator() {} + const char* getPluginName() const override { + return "fused_embedding_eltwise_layernorm_plugin"; + } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return 
nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + return new EmbEltwiseLayernormPluginDynamic(serial_data, + serial_length); + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator); + #endif } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu index 6a718d47b15..854ca515db5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.cu @@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs, // Dynamic Plugin below. #if IS_TRT_VERSION_GE(6000) -size_t GeluPluginDynamic::getSerializationSize() const { return 0; } - -void GeluPluginDynamic::serialize(void* buffer) const {} nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, diff --git a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h index e4dcce3a856..02219bc27a7 100644 --- a/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/gelu_op_plugin.h @@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT { // It was used for tensorrt deserialization. // It should not be called by users. 
- GeluPlugin(void const* serialData, size_t serialLength) { - deserializeBase(serialData, serialLength); + GeluPlugin(void const* serial_data, size_t serial_length) { + deserializeBase(serial_data, serial_length); } ~GeluPlugin() {} @@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT { bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const override; nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, - int nbInputDims) override; - int enqueue(int batchSize, const void* const* inputs, void** outputs, + int nb_input_dims) override; + int enqueue(int batch_size, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) override; protected: @@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT { class GeluPluginDynamic : public DynamicPluginTensorRT { public: GeluPluginDynamic() {} - GeluPluginDynamic(void const* serialData, size_t serialLength) {} + GeluPluginDynamic(void const* serial_data, size_t serial_length) {} ~GeluPluginDynamic() {} nvinfer1::IPluginV2DynamicExt* clone() const override { @@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override { return 0; } - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { return 0; } + void serialize(void* buffer) const override {} nvinfer1::DimsExprs getOutputDimensions( - int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, - nvinfer1::IExprBuilder& exprBuilder) override; + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const 
nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } }; + +class GeluPluginV2Creator : public nvinfer1::IPluginCreator { + public: + GeluPluginV2Creator() {} + const char* getPluginName() const override { return "gelu_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new GeluPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector 
plugin_attributes_; +}; + +REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index fe3ea180593..5014a7db981 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -141,10 +141,6 @@ inline void TransposeQKV(const int batch, const int seq_len, int QkvToContextPluginDynamic::initialize() { return 0; } -size_t QkvToContextPluginDynamic::getSerializationSize() const { return 0; } - -void QkvToContextPluginDynamic::serialize(void *buffer) const {} - nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h index 18c5d27fc45..72a2732ae20 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h @@ -1,3 +1,17 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); @@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { scale_(scale), ban_fp16_(ban_fp16) {} - QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {} + QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &hidden_); + DeserializeValue(&serial_data, &serial_length, &head_number_); + DeserializeValue(&serial_data, &serial_length, &head_size_); + DeserializeValue(&serial_data, &serial_length, &scale_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + } nvinfer1::IPluginV2DynamicExt* clone() const override { return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_, scale_, ban_fp16_); @@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + return SerializedSize(hidden_) + SerializedSize(head_number_) + + SerializedSize(head_size_) + SerializedSize(scale_) + + SerializedSize(ban_fp16_); + } + void serialize(void* buffer) const override { + SerializeValue(&buffer, hidden_); + SerializeValue(&buffer, head_number_); + SerializeValue(&buffer, head_size_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, ban_fp16_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const 
nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } @@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } @@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT { float scale_; bool ban_fp16_; }; + +class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator { + public: + QkvToContextPluginV2Creator() {} + const char* getPluginName() const override { return "qkv_to_context_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator); 
#endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu index 4fb9bbc1ff8..c51dae5e00c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.cu @@ -32,18 +32,14 @@ namespace plugin { int SkipLayerNormPluginDynamic::initialize() { cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_); - cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float), + cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float), cudaMemcpyHostToDevice); cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_); - cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float), + cudaMemcpy(scale_gpu_, scale_.data(), scale_size_ * sizeof(float), cudaMemcpyHostToDevice); return 0; } -size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; } - -void SkipLayerNormPluginDynamic::serialize(void *buffer) const {} - nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions( int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, nvinfer1::IExprBuilder &expr_builder) { diff --git a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h index ede90d7db16..8fe1edc4bf0 100644 --- a/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/skip_layernorm_op_plugin.h @@ -29,61 +29,84 @@ namespace plugin { #if IS_TRT_VERSION_GE(6000) class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { public: - explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size, - int scale_size, const float eps, - bool ban_fp16) - : bias_(bias), - scale_(scale), - bias_size_(bias_size), + explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale, + int bias_size, int scale_size, + const float eps, bool 
ban_fp16) + : bias_size_(bias_size), scale_size_(scale_size), eps_(eps), - ban_fp16_(ban_fp16) {} - SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {} + ban_fp16_(ban_fp16) { + bias_.resize(bias_size); + scale_.resize(scale_size); + std::copy(bias, bias + bias_size, bias_.data()); + std::copy(scale, scale + scale_size, scale_.data()); + } + SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &bias_); + DeserializeValue(&serial_data, &serial_length, &scale_); + DeserializeValue(&serial_data, &serial_length, &bias_size_); + DeserializeValue(&serial_data, &serial_length, &scale_size_); + DeserializeValue(&serial_data, &serial_length, &eps_); + DeserializeValue(&serial_data, &serial_length, &ban_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const override { - return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_, - scale_size_, eps_, ban_fp16_); + return new SkipLayerNormPluginDynamic( + bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); } const char* getPluginType() const override { return "skip_layernorm_plugin"; } int getNbOutputs() const override { return 1; } int initialize() override; - size_t getSerializationSize() const override; - void serialize(void* buffer) const override; + size_t getSerializationSize() const override { + size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(bias_size_) + SerializedSize(scale_size_) + + SerializedSize(eps_) + SerializedSize(ban_fp16_); + return ser_size; + } + void serialize(void* buffer) const override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, bias_size_); + SerializeValue(&buffer, scale_size_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, ban_fp16_); + } nvinfer1::DimsExprs getOutputDimensions( int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, nvinfer1::IExprBuilder& 
expr_builder) override; bool supportsFormatCombination(int pos, - const nvinfer1::PluginTensorDesc* inOut, - int nbInputs, int nbOutputs) override; + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, int nb_outputs) override; void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, - int nbInputs, + int nb_inputs, const nvinfer1::DynamicPluginTensorDesc* out, - int nbOutputs) override {} + int nb_outputs) override {} size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, - int nbInputs, + int nb_inputs, const nvinfer1::PluginTensorDesc* outputs, - int nbOutputs) const override { + int nb_outputs) const override { return 0; } - int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, - const nvinfer1::PluginTensorDesc* outputDesc, + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, void* const* outputs, void* workspace, cudaStream_t stream) override; nvinfer1::DataType getOutputDataType(int index, - const nvinfer1::DataType* inputTypes, - int nbInputs) const override; + const nvinfer1::DataType* input_types, + int nb_inputs) const override; void destroy() override { delete this; } private: - float* bias_; - float* scale_; + std::vector bias_; + std::vector scale_; float* bias_gpu_; float* scale_gpu_; @@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { float eps_; bool ban_fp16_; }; + +class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator { + public: + SkipLayerNormPluginV2Creator() {} + const char* getPluginName() const override { return "skip_layernorm_plugin"; } + + const char* getPluginVersion() const override { return "1"; } + + const nvinfer1::PluginFieldCollection* getFieldNames() override { + return &field_collection_; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override { + return nullptr; + } + + nvinfer1::IPluginV2* 
deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override { + auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length); + return plugin; + } + + void setPluginNamespace(const char* lib_namespace) override { + plugin_namespace_ = lib_namespace; + } + + const char* getPluginNamespace() const override { + return plugin_namespace_.c_str(); + } + + private: + std::string plugin_namespace_; + std::string plugin_name_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator); #endif } // namespace plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 0fc0f7bf35d..33eec618ff6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -175,11 +175,29 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt { void serializeBase(void*& buffer) const; // NOLINT private: - std::string name_space_{"paddle_trt"}; - std::string plugin_base_{"plugin_dynamic"}; + std::string name_space_; + std::string plugin_base_; }; #endif +template +class TrtPluginRegistrarV2 { + public: + TrtPluginRegistrarV2() { + static auto func_ptr = getPluginRegistry(); + if (func_ptr != nullptr) { + func_ptr->registerCreator(creator, ""); + } + } + + private: + T creator; +}; + +#define REGISTER_TRT_PLUGIN_V2(name) \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ + plugin_registrar_##name {} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index 1cae4ccae4c..18037179c7b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -128,6 +128,12 @@ inline void 
DeserializeValue(void const** buffer, size_t* buffer_size, return details::Serializer::Deserialize(buffer, buffer_size, value); } +template +inline void SerializeCudaPointer(void** buffer, T* value, int size) { + cudaMemcpy((*buffer), value, size * sizeof(T), cudaMemcpyDeviceToHost); + reinterpret_cast(*buffer) += size * sizeof(T); +} + } // namespace plugin } // namespace tensorrt } // namespace inference diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 5fca1d0b6ed..8274ca86dc4 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -420,6 +420,25 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) + + set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") + if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}) + inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") + endif() + + inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) + + set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized/") + if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL}) + inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz") + endif() + + inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized) + endif() set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") diff --git 
a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc new file mode 100644 index 00000000000..6526b874365 --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -0,0 +1,146 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +void run(const AnalysisConfig& config, std::vector* out_data) { + auto predictor = CreatePaddlePredictor(config); + auto input_names = predictor->GetInputNames(); + + int run_batch = 1; + const int run_seq_len = 128; + + std::vector tmp_input; + std::vector tmp_four_input; + tmp_input.reserve(run_batch * run_seq_len); + tmp_four_input.reserve(run_batch * run_seq_len); + + int64_t i0[run_seq_len] = { + 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, + 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, + 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; + int64_t i1[run_seq_len] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + // first input + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({run_batch, run_seq_len, 1}); + input_t->copy_from_cpu(i0); + + // second input + auto input_t2 = predictor->GetInputTensor(input_names[1]); + input_t2->Reshape({run_batch, run_seq_len, 1}); + input_t2->copy_from_cpu(i1); + + // third input. + auto input_t3 = predictor->GetInputTensor(input_names[2]); + input_t3->Reshape({run_batch, run_seq_len, 1}); + input_t3->copy_from_cpu(i2); + + auto input_t4 = predictor->GetInputTensor(input_names[3]); + input_t4->Reshape({run_batch, run_seq_len, 1}); + input_t4->copy_from_cpu(i3); + + ASSERT_TRUE(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); +} + +void trt_ernie(bool with_fp16, std::vector result) { + AnalysisConfig config; + std::string model_dir = FLAGS_infer_model; + SetConfig(&config, model_dir, true /* use_gpu */); + + config.SwitchUseFeedFetchOps(false); + + int head_number = 12; + int batch = 1; + int min_seq_len = 1; + int max_seq_len = 128; + int opt_seq_len = 128; + + std::vector min_shape = {batch, min_seq_len, 1}; + std::vector max_shape = {batch, max_seq_len, 1}; + std::vector opt_shape = {batch, 
opt_seq_len, 1}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {"read_file_0.tmp_0", min_shape}, + {"read_file_0.tmp_1", min_shape}, + {"read_file_0.tmp_2", min_shape}, + {"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}}; + std::map> max_input_shape = { + {"read_file_0.tmp_0", max_shape}, + {"read_file_0.tmp_1", max_shape}, + {"read_file_0.tmp_2", max_shape}, + {"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}}; + std::map> opt_input_shape = { + {"read_file_0.tmp_0", opt_shape}, + {"read_file_0.tmp_1", opt_shape}, + {"read_file_0.tmp_2", opt_shape}, + {"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}}; + + auto precision = AnalysisConfig::Precision::kFloat32; + if (with_fp16) { + precision = AnalysisConfig::Precision::kHalf; + } + config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false); + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + std::vector out_data; + run(config, &out_data); + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(result[i], out_data[i], 1e-6); + } +} + +TEST(AnalysisPredictor, no_fp16) { + std::vector result = {0.597841, 0.219972, 0.182187}; + trt_ernie(false, result); +} + +TEST(AnalysisPredictor, fp16) { +#ifdef SUPPORTS_CUDA_FP16 + std::vector result = {0.598336, 0.219558, 0.182106}; + trt_ernie(true, result); +#endif +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 5fbf0867ba2..babe9977cd5 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector result) { if (with_fp16) { precision = AnalysisConfig::Precision::kHalf; } - config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, true); + 
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false); config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); std::vector out_data; diff --git a/paddle/fluid/operators/math/bert_encoder_functor.cu b/paddle/fluid/operators/math/bert_encoder_functor.cu index 5d47c066d6e..4004ad401ee 100644 --- a/paddle/fluid/operators/math/bert_encoder_functor.cu +++ b/paddle/fluid/operators/math/bert_encoder_functor.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index f3c8e27944c..c9c3a9456b7 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -13,18 +13,62 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/tensorrt.h" +#include namespace paddle { namespace platform { namespace dynload { std::once_flag tensorrt_dso_flag; -void *tensorrt_dso_handle; +void* tensorrt_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +void* GetTensorRtHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer.dll"; +#else + std::string dso_name = "libnvinfer.so"; +#endif + +#if !defined(_WIN32) + int dynload_flags = RTLD_LAZY | RTLD_LOCAL; +#else + int dynload_flags = 0; +#endif // !_WIN32 + + void* dso_handle = dlopen(dso_name.c_str(), dynload_flags); + + if (nullptr == dso_handle) { + auto error_msg = + "TensorRT dynamic library (%s) that Paddle depends on is not " + "configured correctly. (error code is %s)\n" + " Suggestions:\n" + " 1. 
Check if TensorRT " + "is installed correctly and its version is matched with paddlepaddle " + "you installed.\n" + " 2. Configure TensorRT dynamic library environment variables as " + "follows:\n" + " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" + " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" + " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` " + "[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is " + "impossible unless System Integrity Protection (SIP) is disabled.]"; +#if !defined(_WIN32) + auto errorno = dlerror(); +#else + auto errorno = GetLastError(); +#endif // !_WIN32 + std::cerr << string::Sprintf(error_msg, dso_name, errorno); + } + + return dso_handle; +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index 4c7ba0f054c..34ad1e74588 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -27,42 +27,32 @@ namespace paddle { namespace platform { namespace dynload { +void* GetTensorRtHandle(); + extern std::once_flag tensorrt_dso_flag; extern void* tensorrt_dso_handle; -#ifdef PADDLE_USE_DSO - -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ - using tensorrt_func = decltype(&::__name); \ - std::call_once(tensorrt_dso_flag, []() { \ - tensorrt_dso_handle = \ - paddle::platform::dynload::GetTensorRtDsoHandle(); \ - PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \ - }); \ - static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ - PADDLE_ENFORCE(p_##__name, "load %s failed", #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + using tensorrt_func = decltype(&::__name); \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + if (p_##__name == nullptr) { \ + return nullptr; \ + } \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ extern DynLoad__##__name __name -#else -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - tensorrtResult_t operator()(Args... args) { \ - return __name(args...); \ - } \ - }; \ - extern DynLoad__##__name __name -#endif - #define TENSORRT_RAND_ROUTINE_EACH(__macro) \ __macro(createInferBuilder_INTERNAL); \ - __macro(createInferRuntime_INTERNAL); + __macro(createInferRuntime_INTERNAL); \ + __macro(getPluginRegistry); TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) -- GitLab