Unverified commit 97428114, authored by Pei Yang, committed by GitHub

[Cherry pick] Trt ernie serialization (#25956)

* solve conflict

* fix crash when trt not found in python; update unittest model path
Parent 897305d1
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -83,23 +80,10 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
    nvinfer1::ILayer* layer = nullptr;
    if (engine_->with_dynamic_shape()) {
-     auto use_fp16 = engine_->WithFp16();
      plugin::DynamicPluginTensorRT* plugin = nullptr;
-     if (use_fp16) {
-#ifdef SUPPORTS_CUDA_FP16
-       plugin = new plugin::EmbEltwiseLayernormPluginDynamic<half>(
-           input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-           eps);
-#else
-       plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-           input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-           eps);
-#endif
-     } else {
-       plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
-           input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
-           eps);
-     }
      plugin = new plugin::EmbEltwiseLayernormPluginDynamic<float>(
          input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden,
          eps);
      layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin);
    } else {
      PADDLE_THROW(platform::errors::Fatal(
...
@@ -200,9 +200,23 @@ class TensorRTEngine {
  void Deserialize(const std::string& engine_serialized_data) {
    freshDeviceId();
    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
-   infer_engine_.reset(runtime->deserializeCudaEngine(
-       engine_serialized_data.c_str(), engine_serialized_data.size(),
-       &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
    if (with_dynamic_shape_) {
#if IS_TRT_VERSION_GE(6000)
      infer_engine_.reset(runtime->deserializeCudaEngine(
          engine_serialized_data.c_str(), engine_serialized_data.size(),
          nullptr));
#else
      PADDLE_THROW(platform::errors::PreconditionNotMet(
          "To enable dynamic shape support, the TensorRT version should be "
          "greater than 6.0.0"));
#endif
    } else {
      infer_engine_.reset(runtime->deserializeCudaEngine(
          engine_serialized_data.c_str(), engine_serialized_data.size(),
          &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
    }
    PADDLE_ENFORCE(infer_engine_ != nullptr,
                   "build cuda engine failed when deserialize engine info.!");
  }
...
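In the dynamic-shape branch above, no plugin factory is passed because the IPluginV2-based plugins are resolved through TensorRT's global plugin registry. A minimal sketch of what a standalone consumer of such a serialized engine looks like, assuming TensorRT 6 or newer and a blob previously written from ICudaEngine::serialize(); the file name "engine.bin" and the SilentLogger class are illustrative, not Paddle or TensorRT names:

#include <NvInfer.h>
#include <fstream>
#include <iterator>
#include <string>

class SilentLogger : public nvinfer1::ILogger {
  // Ignore all TensorRT log messages in this sketch.
  void log(Severity severity, const char* msg) noexcept override {}
};

int main() {
  // Load the serialized engine bytes from disk.
  std::ifstream fin("engine.bin", std::ios::binary);
  std::string blob((std::istreambuf_iterator<char>(fin)),
                   std::istreambuf_iterator<char>());

  SilentLogger logger;
  nvinfer1::IRuntime* runtime = nvinfer1::createInferRuntime(logger);
  // Plugins registered through the plugin registry (REGISTER_TRT_PLUGIN_V2
  // below) are found automatically, so the factory argument is nullptr.
  nvinfer1::ICudaEngine* engine =
      runtime->deserializeCudaEngine(blob.data(), blob.size(), nullptr);
  bool ok = (engine != nullptr);

  if (engine) engine->destroy();
  runtime->destroy();
  return ok ? 0 : 1;
}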
@@ -56,6 +56,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) {
  return static_cast<nvinfer1::IRuntime*>(
      dy::createInferRuntime_INTERNAL(logger, NV_TENSORRT_VERSION));
}

static nvinfer1::IPluginRegistry* getPluginRegistry() {
  return static_cast<nvinfer1::IPluginRegistry*>(dy::getPluginRegistry());
}

// A logger for create TensorRT infer builder.
class NaiveLogger : public nvinfer1::ILogger {
...
@@ -33,53 +33,29 @@ namespace plugin {
template <typename T>
int EmbEltwiseLayernormPluginDynamic<T>::initialize() {
- int nb_emb = embs_.size();
- std::vector<void *> ptr_vector(nb_emb);
- std::vector<std::vector<half>> emb_fp16(nb_emb);
- if (sizeof(T) == sizeof(float)) {
-   // FP32
-   for (int i = 0; i < nb_emb; ++i) {
-     ptr_vector[i] = embs_[i];
-   }
- } else {
-   // FP16
-   for (int i = 0; i < nb_emb; ++i) {
-     auto emb_size = emb_sizes_[i];
-     auto &tmp = emb_fp16[i];
-     tmp.resize(emb_size);
-     for (int j = 0; j < emb_size; ++j) {
-       tmp[j] = static_cast<half>(embs_[i][j]);
-     }
-     ptr_vector[i] = tmp.data();
-   }
- }
  embs_gpu_.resize(embs_.size());
  for (int i = 0; i < embs_.size(); i++) {
-   cudaMalloc(&embs_gpu_[i], sizeof(T) * emb_sizes_[i]);
-   cudaMemcpy(embs_gpu_[i], ptr_vector[i], emb_sizes_[i] * sizeof(T),
-              cudaMemcpyHostToDevice);
    if (embs_[i]) {
      cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
      cudaMemcpy(embs_gpu_[i], embs_[i], emb_sizes_[i] * sizeof(float),
                 cudaMemcpyHostToDevice);
    }
  }
- cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
- cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
-            cudaMemcpyHostToDevice);
- cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
- cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
-            cudaMemcpyHostToDevice);
  if (bias_) {
    cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
    cudaMemcpy(bias_gpu_, bias_, bias_size_ * sizeof(float),
               cudaMemcpyHostToDevice);
  }
  if (scale_) {
    cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
    cudaMemcpy(scale_gpu_, scale_, scale_size_ * sizeof(float),
               cudaMemcpyHostToDevice);
  }
  return 0;
}
-template <typename T>
-size_t EmbEltwiseLayernormPluginDynamic<T>::getSerializationSize() const {
-  return 0;
-}
-template <typename T>
-void EmbEltwiseLayernormPluginDynamic<T>::serialize(void *buffer) const {}
template <typename T>
nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
...
@@ -44,8 +44,42 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
        hidden_size_(hidden_size),
        eps_(eps) {}

- EmbEltwiseLayernormPluginDynamic(void const* serialData, size_t serialLength) {}
  EmbEltwiseLayernormPluginDynamic(void const* serial_data,
                                   size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &emb_sizes_);
embs_gpu_.resize(emb_sizes_.size());
embs_.resize(emb_sizes_.size());
for (size_t i = 0; i < emb_sizes_.size(); i++) {
cudaMalloc(&embs_gpu_[i], sizeof(float) * emb_sizes_[i]);
cudaMemcpy(embs_gpu_[i], serial_data, emb_sizes_[i] * sizeof(float),
cudaMemcpyHostToDevice);
reinterpret_cast<char const*&>(serial_data) +=
emb_sizes_[i] * sizeof(float);
serial_length -= emb_sizes_[i] * sizeof(float);
embs_[i] = nullptr;
}
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
cudaMemcpy(bias_gpu_, serial_data, bias_size_ * sizeof(float),
cudaMemcpyHostToDevice);
bias_ = nullptr;
reinterpret_cast<char const*&>(serial_data) += bias_size_ * sizeof(float);
serial_length -= bias_size_ * sizeof(float);
cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
cudaMemcpy(scale_gpu_, serial_data, scale_size_ * sizeof(float),
cudaMemcpyHostToDevice);
scale_ = nullptr;
reinterpret_cast<char const*&>(serial_data) += scale_size_ * sizeof(float);
serial_length -= scale_size_ * sizeof(float);
DeserializeValue(&serial_data, &serial_length, &hidden_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new EmbEltwiseLayernormPluginDynamic(
        embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
@@ -58,36 +92,66 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override;

- size_t getSerializationSize() const override;
- void serialize(void* buffer) const override;
  size_t getSerializationSize() const override {
    int sum_num = 0;
sum_num += SerializedSize(emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
sum_num += emb_sizes_[i] * sizeof(float);
}
sum_num += SerializedSize(bias_size_);
sum_num += SerializedSize(scale_size_);
sum_num += (bias_size_ + scale_size_) * sizeof(float);
sum_num += SerializedSize(hidden_size_);
sum_num += SerializedSize(eps_);
// sum_num += SerializedSize(with_fp16_);
return sum_num;
}
void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_);
for (size_t i = 0; i < emb_sizes_.size(); i++) {
SerializeCudaPointer(&buffer, embs_gpu_[i], emb_sizes_[i]);
}
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeCudaPointer(&buffer, bias_gpu_, bias_size_);
SerializeCudaPointer(&buffer, scale_gpu_, scale_size_);
SerializeValue(&buffer, hidden_size_);
SerializeValue(&buffer, eps_);
}
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
@@ -99,7 +163,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  // data on devices
  float* bias_gpu_;
  float* scale_gpu_;
- std::vector<T*> embs_gpu_;
  std::vector<float*> embs_gpu_;

  std::vector<int> emb_sizes_;
  int bias_size_;
@@ -107,6 +171,49 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
  int hidden_size_;
  float eps_;
};
class EmbEltwiseLayernormPluginV2Creator : public nvinfer1::IPluginCreator {
public:
EmbEltwiseLayernormPluginV2Creator() {}
const char* getPluginName() const override {
return "fused_embedding_eltwise_layernorm_plugin";
}
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
return new EmbEltwiseLayernormPluginDynamic<float>(serial_data,
serial_length);
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(EmbEltwiseLayernormPluginV2Creator);
#endif
} // namespace plugin
} // namespace tensorrt
...
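For orientation, the serialize()/deserializing-constructor pair above writes and reads the plugin state in one fixed order. A sketch of that byte layout, as an annotation only, with the field order taken directly from the code above:

// Layout written by EmbEltwiseLayernormPluginDynamic::serialize() and read
// back by the deserializing constructor:
//   emb_sizes_                   (std::vector<int>, via SerializeValue)
//   embedding table 0 .. n-1     (emb_sizes_[i] floats each, copied back from
//                                 the GPU by SerializeCudaPointer)
//   bias_size_, scale_size_      (two ints)
//   bias, scale                  (bias_size_ / scale_size_ floats)
//   hidden_size_                 (int)
//   eps_                         (float)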
@@ -132,9 +132,6 @@ int GeluPlugin::enqueue(int batch_size, const void* const* inputs,
// Dynamic Plugin below.
#if IS_TRT_VERSION_GE(6000)
-size_t GeluPluginDynamic::getSerializationSize() const { return 0; }
-void GeluPluginDynamic::serialize(void* buffer) const {}
nvinfer1::DimsExprs GeluPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
...
@@ -30,8 +30,8 @@ class GeluPlugin : public PluginTensorRT {
  // It was used for tensorrt deserialization.
  // It should not be called by users.
  GeluPlugin(void const* serial_data, size_t serial_length) {
    deserializeBase(serial_data, serial_length);
  }

  ~GeluPlugin() {}
@@ -43,8 +43,8 @@ class GeluPlugin : public PluginTensorRT {
  bool supportsFormat(nvinfer1::DataType type,
                      nvinfer1::PluginFormat format) const override;
  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs,
                                     int nb_input_dims) override;
  int enqueue(int batch_size, const void* const* inputs, void** outputs,
              void* workspace, cudaStream_t stream) override;

 protected:
@@ -64,7 +64,7 @@ class GeluPlugin : public PluginTensorRT {
class GeluPluginDynamic : public DynamicPluginTensorRT {
 public:
  GeluPluginDynamic() {}
  GeluPluginDynamic(void const* serial_data, size_t serial_length) {}
  ~GeluPluginDynamic() {}

  nvinfer1::IPluginV2DynamicExt* clone() const override {
@@ -75,39 +75,79 @@ class GeluPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override { return 0; }

  size_t getSerializationSize() const override { return 0; }
  void serialize(void* buffer) const override {}

  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
};
class GeluPluginV2Creator : public nvinfer1::IPluginCreator {
public:
GeluPluginV2Creator() {}
const char* getPluginName() const override { return "gelu_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new GeluPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_{0, nullptr};
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(GeluPluginV2Creator);
#endif
} // namespace plugin
...
@@ -141,10 +141,6 @@ inline void TransposeQKV(const int batch, const int seq_len,
int QkvToContextPluginDynamic::initialize() { return 0; }
-size_t QkvToContextPluginDynamic::getSerializationSize() const { return 0; }
-void QkvToContextPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs QkvToContextPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
...
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -37,7 +51,13 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
        scale_(scale),
        ban_fp16_(ban_fp16) {}

- QkvToContextPluginDynamic(void const* serialData, size_t serialLength) {}
  QkvToContextPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &hidden_);
DeserializeValue(&serial_data, &serial_length, &head_number_);
DeserializeValue(&serial_data, &serial_length, &head_size_);
DeserializeValue(&serial_data, &serial_length, &scale_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
    return new QkvToContextPluginDynamic(hidden_, head_number_, head_size_,
                                         scale_, ban_fp16_);
@@ -47,26 +67,36 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
  int getNbOutputs() const override { return 1; }
  int initialize() override;

- size_t getSerializationSize() const override;
- void serialize(void* buffer) const override;
  size_t getSerializationSize() const override {
    return SerializedSize(hidden_) + SerializedSize(head_number_) +
SerializedSize(head_size_) + SerializedSize(scale_) +
SerializedSize(ban_fp16_);
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, hidden_);
SerializeValue(&buffer, head_number_);
SerializeValue(&buffer, head_size_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, ban_fp16_);
}
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
@@ -75,8 +105,8 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
@@ -87,6 +117,45 @@ class QkvToContextPluginDynamic : public DynamicPluginTensorRT {
  float scale_;
  bool ban_fp16_;
};
class QkvToContextPluginV2Creator : public nvinfer1::IPluginCreator {
public:
QkvToContextPluginV2Creator() {}
const char* getPluginName() const override { return "qkv_to_context_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new QkvToContextPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(QkvToContextPluginV2Creator);
#endif
} // namespace plugin
...
@@ -32,18 +32,14 @@ namespace plugin {
int SkipLayerNormPluginDynamic::initialize() {
  cudaMalloc(&bias_gpu_, sizeof(float) * bias_size_);
  cudaMemcpy(bias_gpu_, bias_.data(), bias_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  cudaMalloc(&scale_gpu_, sizeof(float) * scale_size_);
  cudaMemcpy(scale_gpu_, scale_.data(), scale_size_ * sizeof(float),
             cudaMemcpyHostToDevice);
  return 0;
}
-size_t SkipLayerNormPluginDynamic::getSerializationSize() const { return 0; }
-void SkipLayerNormPluginDynamic::serialize(void *buffer) const {}
nvinfer1::DimsExprs SkipLayerNormPluginDynamic::getOutputDimensions(
    int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs,
    nvinfer1::IExprBuilder &expr_builder) {
...
@@ -29,61 +29,84 @@ namespace plugin {
#if IS_TRT_VERSION_GE(6000)
class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
 public:
- explicit SkipLayerNormPluginDynamic(float* bias, float* scale, int bias_size,
-                                     int scale_size, const float eps,
-                                     bool ban_fp16)
-     : bias_(bias),
-       scale_(scale),
-       bias_size_(bias_size),
-       scale_size_(scale_size),
-       eps_(eps),
-       ban_fp16_(ban_fp16) {}
- SkipLayerNormPluginDynamic(void const* serialData, size_t serialLength) {}
  explicit SkipLayerNormPluginDynamic(const float* bias, const float* scale,
                                      int bias_size, int scale_size,
                                      const float eps, bool ban_fp16)
      : bias_size_(bias_size),
        scale_size_(scale_size),
        eps_(eps),
        ban_fp16_(ban_fp16) {
    bias_.resize(bias_size);
    scale_.resize(scale_size);
    std::copy(bias, bias + bias_size, bias_.data());
    std::copy(scale, scale + scale_size, scale_.data());
  }
SkipLayerNormPluginDynamic(void const* serial_data, size_t serial_length) {
DeserializeValue(&serial_data, &serial_length, &bias_);
DeserializeValue(&serial_data, &serial_length, &scale_);
DeserializeValue(&serial_data, &serial_length, &bias_size_);
DeserializeValue(&serial_data, &serial_length, &scale_size_);
DeserializeValue(&serial_data, &serial_length, &eps_);
DeserializeValue(&serial_data, &serial_length, &ban_fp16_);
}
  nvinfer1::IPluginV2DynamicExt* clone() const override {
-   return new SkipLayerNormPluginDynamic(bias_, scale_, bias_size_,
-                                         scale_size_, eps_, ban_fp16_);
    return new SkipLayerNormPluginDynamic(
        bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
  }
  const char* getPluginType() const override { return "skip_layernorm_plugin"; }
  int getNbOutputs() const override { return 1; }
  int initialize() override;

- size_t getSerializationSize() const override;
- void serialize(void* buffer) const override;
  size_t getSerializationSize() const override {
    size_t ser_size = SerializedSize(bias_) + SerializedSize(scale_) +
                      SerializedSize(bias_size_) + SerializedSize(scale_size_) +
                      SerializedSize(eps_) + SerializedSize(ban_fp16_);
return ser_size;
}
void serialize(void* buffer) const override {
SerializeValue(&buffer, bias_);
SerializeValue(&buffer, scale_);
SerializeValue(&buffer, bias_size_);
SerializeValue(&buffer, scale_size_);
SerializeValue(&buffer, eps_);
SerializeValue(&buffer, ban_fp16_);
}
  nvinfer1::DimsExprs getOutputDimensions(
      int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs,
      nvinfer1::IExprBuilder& expr_builder) override;
  bool supportsFormatCombination(int pos,
                                 const nvinfer1::PluginTensorDesc* in_out,
                                 int nb_inputs, int nb_outputs) override;
  void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in,
                       int nb_inputs,
                       const nvinfer1::DynamicPluginTensorDesc* out,
                       int nb_outputs) override {}
  size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs,
                          int nb_inputs,
                          const nvinfer1::PluginTensorDesc* outputs,
                          int nb_outputs) const override {
    return 0;
  }
  int enqueue(const nvinfer1::PluginTensorDesc* input_desc,
              const nvinfer1::PluginTensorDesc* output_desc,
              const void* const* inputs, void* const* outputs, void* workspace,
              cudaStream_t stream) override;
  nvinfer1::DataType getOutputDataType(int index,
                                       const nvinfer1::DataType* input_types,
                                       int nb_inputs) const override;

  void destroy() override { delete this; }
 private:
- float* bias_;
- float* scale_;
  std::vector<float> bias_;
  std::vector<float> scale_;

  float* bias_gpu_;
  float* scale_gpu_;
@@ -94,6 +117,45 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
  float eps_;
  bool ban_fp16_;
};
class SkipLayerNormPluginV2Creator : public nvinfer1::IPluginCreator {
public:
SkipLayerNormPluginV2Creator() {}
const char* getPluginName() const override { return "skip_layernorm_plugin"; }
const char* getPluginVersion() const override { return "1"; }
const nvinfer1::PluginFieldCollection* getFieldNames() override {
return &field_collection_;
}
nvinfer1::IPluginV2* createPlugin(
const char* name, const nvinfer1::PluginFieldCollection* fc) override {
return nullptr;
}
nvinfer1::IPluginV2* deserializePlugin(const char* name,
const void* serial_data,
size_t serial_length) override {
auto plugin = new SkipLayerNormPluginDynamic(serial_data, serial_length);
return plugin;
}
void setPluginNamespace(const char* lib_namespace) override {
plugin_namespace_ = lib_namespace;
}
const char* getPluginNamespace() const override {
return plugin_namespace_.c_str();
}
private:
std::string plugin_namespace_;
std::string plugin_name_;
nvinfer1::PluginFieldCollection field_collection_;
std::vector<nvinfer1::PluginField> plugin_attributes_;
};
REGISTER_TRT_PLUGIN_V2(SkipLayerNormPluginV2Creator);
#endif
} // namespace plugin
...
@@ -175,11 +175,29 @@ class DynamicPluginTensorRT : public nvinfer1::IPluginV2DynamicExt {
  void serializeBase(void*& buffer) const; // NOLINT

 private:
- std::string name_space_{"paddle_trt"};
- std::string plugin_base_{"plugin_dynamic"};
  std::string name_space_;
  std::string plugin_base_;
};
#endif
template <typename T>
class TrtPluginRegistrarV2 {
public:
TrtPluginRegistrarV2() {
static auto func_ptr = getPluginRegistry();
if (func_ptr != nullptr) {
func_ptr->registerCreator(creator, "");
}
}
private:
T creator;
};
#define REGISTER_TRT_PLUGIN_V2(name) \
static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2<name> \
plugin_registrar_##name {}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
...
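REGISTER_TRT_PLUGIN_V2 above relies on static initialization: constructing a file-scope TrtPluginRegistrarV2<T> registers the plugin creator with TensorRT's global plugin registry before main() runs, which is what lets deserializeCudaEngine rebuild the plugins later. A self-contained sketch of the same pattern follows; FakeRegistry, RegistrarV2 and GeluCreatorStandIn are illustrative names, not Paddle or TensorRT types:

#include <iostream>
#include <string>
#include <vector>

// Stand-in for nvinfer1::IPluginRegistry: it only records creator names.
struct FakeRegistry {
  std::vector<std::string> names;
  void registerCreator(const std::string& name) { names.push_back(name); }
};

FakeRegistry* getFakeRegistry() {
  static FakeRegistry registry;
  return &registry;
}

// Mirrors TrtPluginRegistrarV2: the constructor runs during static
// initialization and registers one creator with the global registry.
template <typename T>
struct RegistrarV2 {
  RegistrarV2() { getFakeRegistry()->registerCreator(T().name()); }
};

#define REGISTER_FAKE_PLUGIN_V2(name) \
  static RegistrarV2<name> fake_plugin_registrar_##name{}

struct GeluCreatorStandIn {
  std::string name() const { return "gelu_plugin"; }
};
REGISTER_FAKE_PLUGIN_V2(GeluCreatorStandIn);

int main() {
  // By the time main() runs, the static registrar has already fired.
  for (const auto& n : getFakeRegistry()->names) std::cout << n << "\n";
  return 0;
}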
@@ -128,6 +128,12 @@ inline void DeserializeValue(void const** buffer, size_t* buffer_size,
  return details::Serializer<T>::Deserialize(buffer, buffer_size, value);
}
template <typename T>
inline void SerializeCudaPointer(void** buffer, T* value, int size) {
cudaMemcpy((*buffer), value, size * sizeof(T), cudaMemcpyDeviceToHost);
reinterpret_cast<char*&>(*buffer) += size * sizeof(T);
}
} // namespace plugin
} // namespace tensorrt
} // namespace inference
...
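The plugins in this commit write their state with SerializeValue/SerializedSize and read it back with DeserializeValue, plus the new SerializeCudaPointer for device-resident weights. The following host-only sketch mimics that byte layout for one POD field and one vector field; PackValue/UnpackValue and friends are simplified stand-ins for Paddle's serialize.h helpers, not the real implementation:

#include <cassert>
#include <cstring>
#include <iostream>
#include <vector>

// A POD is copied verbatim; a vector is written as (size_t count) followed
// by its raw elements, similar in spirit to Serializer<T> above.
template <typename T>
void PackValue(char** buf, const T& value) {
  std::memcpy(*buf, &value, sizeof(T));
  *buf += sizeof(T);
}
template <typename T>
void PackVector(char** buf, const std::vector<T>& v) {
  PackValue(buf, v.size());
  std::memcpy(*buf, v.data(), v.size() * sizeof(T));
  *buf += v.size() * sizeof(T);
}
template <typename T>
void UnpackValue(const char** buf, T* value) {
  std::memcpy(value, *buf, sizeof(T));
  *buf += sizeof(T);
}
template <typename T>
void UnpackVector(const char** buf, std::vector<T>* v) {
  size_t n = 0;
  UnpackValue(buf, &n);
  v->resize(n);
  std::memcpy(v->data(), *buf, n * sizeof(T));
  *buf += n * sizeof(T);
}

int main() {
  // "Plugin state": a hidden size and a bias table, as in the plugins above.
  int hidden = 768;
  std::vector<float> bias = {0.1f, 0.2f, 0.3f};

  // getSerializationSize() analogue.
  size_t size = sizeof(hidden) + sizeof(size_t) + bias.size() * sizeof(float);
  std::vector<char> blob(size);

  // serialize() analogue: write fields in a fixed order.
  char* w = blob.data();
  PackValue(&w, hidden);
  PackVector(&w, bias);

  // Deserializing-constructor analogue: read them back in the same order.
  const char* r = blob.data();
  int hidden2 = 0;
  std::vector<float> bias2;
  UnpackValue(&r, &hidden2);
  UnpackVector(&r, &bias2);

  assert(hidden2 == hidden && bias2 == bias);
  std::cout << "round trip ok: hidden=" << hidden2 << "\n";
  return 0;
}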
@@ -420,6 +420,25 @@ if(WITH_GPU AND TENSORRT_FOUND)
  inference_analysis_test(test_trt_dynamic_shape_ernie SRCS trt_dynamic_shape_ernie_test.cc
          EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
          ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4)
set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
inference_download_and_uncompress(${TRT_MODEL_INSTALL_DIR} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/ernie_model_4_serialized)
endif()

set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
...
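Assuming the standard CTest registration that inference_analysis_test performs, the two new targets should be runnable from the build tree with `ctest -R test_trt_dynamic_shape_ernie_serialize` and `ctest -R test_trt_dynamic_shape_ernie_deserialize` once the models have been downloaded.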
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle {
namespace inference {
void run(const AnalysisConfig& config, std::vector<float>* out_data) {
auto predictor = CreatePaddlePredictor(config);
auto input_names = predictor->GetInputNames();
int run_batch = 1;
const int run_seq_len = 128;
std::vector<int64_t> tmp_input;
std::vector<float> tmp_four_input;
tmp_input.reserve(run_batch * run_seq_len);
tmp_four_input.reserve(run_batch * run_seq_len);
int64_t i0[run_seq_len] = {
1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321,
4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2,
75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2};
int64_t i1[run_seq_len] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
30, 31, 32, 33, 34, 35, 36, 37, 38, 39};
float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
// first input
auto input_t = predictor->GetInputTensor(input_names[0]);
input_t->Reshape({run_batch, run_seq_len, 1});
input_t->copy_from_cpu(i0);
// second input
auto input_t2 = predictor->GetInputTensor(input_names[1]);
input_t2->Reshape({run_batch, run_seq_len, 1});
input_t2->copy_from_cpu(i1);
// third input.
auto input_t3 = predictor->GetInputTensor(input_names[2]);
input_t3->Reshape({run_batch, run_seq_len, 1});
input_t3->copy_from_cpu(i2);
auto input_t4 = predictor->GetInputTensor(input_names[3]);
input_t4->Reshape({run_batch, run_seq_len, 1});
input_t4->copy_from_cpu(i3);
ASSERT_TRUE(predictor->ZeroCopyRun());
auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputTensor(output_names[0]);
std::vector<int> output_shape = output_t->shape();
int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1,
std::multiplies<int>());
out_data->resize(out_num);
output_t->copy_to_cpu(out_data->data());
}
void trt_ernie(bool with_fp16, std::vector<float> result) {
AnalysisConfig config;
std::string model_dir = FLAGS_infer_model;
SetConfig(&config, model_dir, true /* use_gpu */);
config.SwitchUseFeedFetchOps(false);
int head_number = 12;
int batch = 1;
int min_seq_len = 1;
int max_seq_len = 128;
int opt_seq_len = 128;
std::vector<int> min_shape = {batch, min_seq_len, 1};
std::vector<int> max_shape = {batch, max_seq_len, 1};
std::vector<int> opt_shape = {batch, opt_seq_len, 1};
// Set the input's min, max, opt shape
std::map<std::string, std::vector<int>> min_input_shape = {
{"read_file_0.tmp_0", min_shape},
{"read_file_0.tmp_1", min_shape},
{"read_file_0.tmp_2", min_shape},
{"stack_0.tmp_0", {batch, head_number, min_seq_len, min_seq_len}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"read_file_0.tmp_0", max_shape},
{"read_file_0.tmp_1", max_shape},
{"read_file_0.tmp_2", max_shape},
{"stack_0.tmp_0", {batch, head_number, max_seq_len, max_seq_len}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"read_file_0.tmp_0", opt_shape},
{"read_file_0.tmp_1", opt_shape},
{"read_file_0.tmp_2", opt_shape},
{"stack_0.tmp_0", {batch, head_number, opt_seq_len, opt_seq_len}}};
auto precision = AnalysisConfig::Precision::kFloat32;
if (with_fp16) {
precision = AnalysisConfig::Precision::kHalf;
}
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
std::vector<float> out_data;
run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6);
}
}
TEST(AnalysisPredictor, no_fp16) {
std::vector<float> result = {0.597841, 0.219972, 0.182187};
trt_ernie(false, result);
}
TEST(AnalysisPredictor, fp16) {
#ifdef SUPPORTS_CUDA_FP16
std::vector<float> result = {0.598336, 0.219558, 0.182106};
trt_ernie(true, result);
#endif
}
} // namespace inference
} // namespace paddle
@@ -120,7 +120,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
  if (with_fp16) {
    precision = AnalysisConfig::Precision::kHalf;
  }
- config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, true);
  config.EnableTensorRtEngine(1 << 30, 1, 5, precision, false, false);
  config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
                                opt_input_shape);
  std::vector<float> out_data;
...
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <cuda_runtime.h>
#include <algorithm>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
...
@@ -13,18 +13,62 @@
limitations under the License. */
#include "paddle/fluid/platform/dynload/tensorrt.h"
#include <string>

namespace paddle {
namespace platform {
namespace dynload {

std::once_flag tensorrt_dso_flag;
void* tensorrt_dso_handle;

#define DEFINE_WRAP(__name) DynLoad__##__name __name

TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP);
void* GetTensorRtHandle() {
#if defined(__APPLE__) || defined(__OSX__)
std::string dso_name = "libnvinfer.dylib";
#elif defined(_WIN32)
std::string dso_name = "nvinfer.dll";
#else
std::string dso_name = "libnvinfer.so";
#endif
#if !defined(_WIN32)
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
#else
int dynload_flags = 0;
#endif // !_WIN32
void* dso_handle = dlopen(dso_name.c_str(), dynload_flags);
if (nullptr == dso_handle) {
auto error_msg =
"TensorRT dynamic library (%s) that Paddle depends on is not "
"configured correctly. (error code is %s)\n"
" Suggestions:\n"
" 1. Check if TensorRT "
"is installed correctly and its version is matched with paddlepaddle "
"you installed.\n"
" 2. Configure TensorRT dynamic library environment variables as "
"follows:\n"
" - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n"
" - Windows: set PATH by `set PATH=XXX;%PATH%`\n"
" - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...` "
"[Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is "
"impossible unless System Integrity Protection (SIP) is disabled.]";
#if !defined(_WIN32)
auto errorno = dlerror();
#else
auto errorno = GetLastError();
#endif // !_WIN32
std::cerr << string::Sprintf(error_msg, dso_name, errorno);
}
return dso_handle;
}
} // namespace dynload
} // namespace platform
} // namespace paddle
@@ -27,42 +27,32 @@ namespace paddle {
namespace platform {
namespace dynload {

void* GetTensorRtHandle();

extern std::once_flag tensorrt_dso_flag;
extern void* tensorrt_dso_handle;

-#ifdef PADDLE_USE_DSO
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                      \
-  struct DynLoad__##__name {                                            \
-    template <typename... Args>                                         \
-    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {    \
-      using tensorrt_func = decltype(&::__name);                        \
-      std::call_once(tensorrt_dso_flag, []() {                          \
-        tensorrt_dso_handle =                                           \
-            paddle::platform::dynload::GetTensorRtDsoHandle();          \
-        PADDLE_ENFORCE(tensorrt_dso_handle, "load tensorrt so failed"); \
-      });                                                               \
-      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);    \
-      PADDLE_ENFORCE(p_##__name, "load %s failed", #__name);            \
-      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);      \
-    }                                                                   \
-  };                                                                    \
-  extern DynLoad__##__name __name
-#else
-#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \
-  struct DynLoad__##__name {                       \
-    template <typename... Args>                    \
-    tensorrtResult_t operator()(Args... args) {    \
-      return __name(args...);                      \
-    }                                              \
-  };                                               \
-  extern DynLoad__##__name __name
-#endif
#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name)                            \
  struct DynLoad__##__name {                                                  \
    template <typename... Args>                                               \
    auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) {          \
      using tensorrt_func = decltype(&::__name);                              \
      std::call_once(tensorrt_dso_flag, []() {                                \
        tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \
      });                                                                     \
      static void* p_##__name = dlsym(tensorrt_dso_handle, #__name);          \
      if (p_##__name == nullptr) {                                            \
        return nullptr;                                                       \
      }                                                                       \
      return reinterpret_cast<tensorrt_func>(p_##__name)(args...);            \
    }                                                                         \
  };                                                                          \
  extern DynLoad__##__name __name

#define TENSORRT_RAND_ROUTINE_EACH(__macro) \
  __macro(createInferBuilder_INTERNAL);     \
- __macro(createInferRuntime_INTERNAL);
  __macro(createInferRuntime_INTERNAL);     \
  __macro(getPluginRegistry);

TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP)
...