未验证 提交 8e163f92 编写于 作者: W Wilber 提交者: GitHub

[Inference Tensorrt] Add attr for trt engine and handle the input seq problem...

[Inference Tensorrt] Add attr for trt engine and handle the input seq problem for ernie var len. (#33575) (#33622)
上级 c3807f9e
...@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { ...@@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter {
framework::OpDesc op_desc(op, nullptr); framework::OpDesc op_desc(op, nullptr);
auto word_id_name = op_desc.Input("WordId").front(); auto word_id_name = op_desc.Input("WordId").front();
auto pos_id_name = op_desc.Input("PosId").front(); auto pos_id_name = op_desc.Input("PosId").front();
engine_->Set("ernie_pos_name", new std::string(pos_id_name));
auto sent_id_name = op_desc.Input("SentId").front(); auto sent_id_name = op_desc.Input("SentId").front();
auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto word_emb_name = op_desc.Input("WordEmbedding").front();
auto pos_emb_name = op_desc.Input("PosEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front();
......
...@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter { ...@@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter {
std::vector<nvinfer1::ITensor*> plugin_inputs; std::vector<nvinfer1::ITensor*> plugin_inputs;
plugin_inputs.emplace_back(fc_layer->getOutput(0)); plugin_inputs.emplace_back(fc_layer->getOutput(0));
plugin_inputs.emplace_back(mask_tensor); plugin_inputs.emplace_back(mask_tensor);
if (engine_->Has("ernie_pos_name")) {
plugin_inputs.emplace_back(
engine_->GetITensor(engine_->Get<std::string>("ernie_pos_name")));
} else {
plugin_inputs.emplace_back(engine_->GetITensor( plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens, engine_->network()
// eval_placeholder_2 ->getInput(2)
->getName())); // cu_seqlens, eval_placeholder_2
}
auto max_seqlen_tensor = auto max_seqlen_tensor =
engine_->GetITensor(engine_->network()->getInput(3)->getName()); engine_->GetITensor(engine_->network()->getInput(3)->getName());
auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( auto* shuffle_layer = TRT_ENGINE_ADD_LAYER(
......
...@@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter { ...@@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter {
std::vector<nvinfer1::ITensor*> plugin_inputs; std::vector<nvinfer1::ITensor*> plugin_inputs;
// plugin_inputs.emplace_back(trans_layer->getOutput(0)); // plugin_inputs.emplace_back(trans_layer->getOutput(0));
plugin_inputs.emplace_back(input); plugin_inputs.emplace_back(input);
plugin_inputs.emplace_back(engine_->GetITensor(
engine_->network()->getInput(2)->getName())); // cu_seqlens, std::string pos_name;
// eval_placeholder_2 if (engine_->Has("ernie_pos_name")) {
pos_name = engine_->Get<std::string>("ernie_pos_name");
} else {
// hard code for compatibility
pos_name = engine_->network()->getInput(2)->getName();
}
plugin_inputs.emplace_back(
engine_->GetITensor(pos_name)); // cu_seqlens, eval_placeholder_2
// bool ban_fp16 = engine_->disable_trt_plugin_fp16(); // bool ban_fp16 = engine_->disable_trt_plugin_fp16();
plugin::SpecialSlicePluginDynamic* plugin = plugin::SpecialSlicePluginDynamic* plugin =
......
...@@ -202,7 +202,15 @@ class TensorRTEngine { ...@@ -202,7 +202,15 @@ class TensorRTEngine {
dy::initLibNvInferPlugins(&logger, ""); dy::initLibNvInferPlugins(&logger, "");
} }
// Destructor: for every stored attribute that has a registered deleter, run
// that deleter; afterwards empty both the attribute map and the deleter map.
~TensorRTEngine() {} ~TensorRTEngine() {
for (auto& attr : attrs_) {
// Only attributes with an entry in attr_dels_ are released here; attributes
// without one are left untouched for their external owner.
if (attr_dels_.find(attr.first) != attr_dels_.end()) {
attr_dels_[attr.first]();
}
}
attrs_.clear();
attr_dels_.clear();
}
// Add an input and set its name, data type and dimension. // Add an input and set its name, data type and dimension.
nvinfer1::ITensor* DeclareInput(const std::string& name, nvinfer1::ITensor* DeclareInput(const std::string& name,
...@@ -386,6 +394,82 @@ class TensorRTEngine { ...@@ -386,6 +394,82 @@ class TensorRTEngine {
} }
#endif #endif
// Returns true when an attribute named `attr_name` is currently stored.
bool Has(const std::string& attr_name) const {
  return attrs_.find(attr_name) != attrs_.end();
}
// Removes the attribute `attr_name`. When a deleter is registered under the
// same name it is invoked first and then dropped. Does nothing if no such
// attribute is stored.
void Erase(const std::string& attr_name) {
  const auto attr_it = attrs_.find(attr_name);
  if (attr_it == attrs_.end()) {
    return;
  }
  const auto deleter_it = attr_dels_.find(attr_name);
  if (deleter_it != attr_dels_.end()) {
    deleter_it->second();
    attr_dels_.erase(deleter_it);
  }
  attrs_.erase(attr_it);
}
// Set a pointer to the attribute. Engine takes ownership of the attribute.
template <typename AttrType>
void Set(const std::string& attr_name, AttrType* attr) {
if (attrs_.count(attr_name) == 0) {
PADDLE_ENFORCE_EQ(
attrs_.count(attr_name), 0,
platform::errors::AlreadyExists(
"Attribute %s already set in trt engine.", attr_name));
} else {
VLOG(3) << "Setting the attribute " << attr_name << " for trt engine "
<< this;
}
attrs_[attr_name] = attr;
attr_dels_[attr_name] = [attr, attr_name]() {
VLOG(3) << "deleting " << attr_name;
delete attr;
};
}
// Stores `attr` under `attr_name` without taking ownership: no deleter is
// registered, so the caller stays responsible for deleting the attribute.
// Enforces that no attribute with this name is stored yet.
template <typename AttrType>
void SetNotOwned(const std::string& attr_name, AttrType* attr) {
  PADDLE_ENFORCE_EQ(
      attrs_.count(attr_name), 0,
      platform::errors::AlreadyExists(
          "Attribute %s already set in trt engine.", attr_name));
  // The name is known to be absent at this point, so a plain insertion is
  // equivalent to assigning through operator[].
  attrs_.emplace(attr_name, attr);
}
// Get a reference to the attributed previously set.
template <typename AttrType>
AttrType& Get(const std::string& attr_name) const {
PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(),
platform::errors::InvalidArgument(
"Attribute %s not found in trt engine.", attr_name));
try {
return *boost::any_cast<AttrType*>(attrs_.at(attr_name));
} catch (boost::bad_any_cast&) {
auto TypeToString = [](const std::type_info& info) -> std::string {
if (std::type_index(info) == std::type_index(typeid(bool*))) {
return "bool";
} else if (std::type_index(info) == std::type_index(typeid(int*))) {
return "int";
} else if (std::type_index(info) ==
std::type_index(typeid(const int*))) {
return "const int";
} else if (std::type_index(info) ==
std::type_index(typeid(std::string*))) {
return "std::string";
}
return info.name();
};
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid type for attritube %s, expected: %s, actual: %s.", attr_name,
TypeToString(typeid(AttrType*)),
TypeToString(attrs_.at(attr_name).type())));
}
}
private: private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated, // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling // ensure that the thread is associated with the correct device by calling
...@@ -441,6 +525,9 @@ class TensorRTEngine { ...@@ -441,6 +525,9 @@ class TensorRTEngine {
infer_ptr<nvinfer1::IHostMemory> ihost_memory_; infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_; std::unordered_map<nvinfer1::ITensor*, float> quant_dynamic_range_;
std::unordered_map<std::string, boost::any> attrs_;
std::unordered_map<std::string, std::function<void(void)>> attr_dels_;
// For dynamic shape // For dynamic shape
bool with_dynamic_shape_{false}; bool with_dynamic_shape_{false};
infer_ptr<nvinfer1::INetworkDefinition> infer_networkv2_; infer_ptr<nvinfer1::INetworkDefinition> infer_networkv2_;
......
...@@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) {
buffers[0] = reinterpret_cast<void *>(x_v_gpu_data); buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
buffers[1] = reinterpret_cast<void *>(y_gpu_data); buffers[1] = reinterpret_cast<void *>(y_gpu_data);
LOG(INFO) << "Set attr";
engine_->Set("test_attr", new std::string("test_attr"));
if (engine_->Has("test_attr")) {
auto attr_val = engine_->Get<std::string>("test_attr");
engine_->Erase("test_attr");
}
std::string *attr_key = new std::string("attr_key");
engine_->SetNotOwned("attr1", attr_key);
LOG(INFO) << "to execute"; LOG(INFO) << "to execute";
engine_->Execute(1, &buffers, ctx_->stream()); engine_->Execute(1, &buffers, ctx_->stream());
...@@ -99,6 +108,8 @@ TEST_F(TensorRTEngineTest, add_layer) { ...@@ -99,6 +108,8 @@ TEST_F(TensorRTEngineTest, add_layer) {
LOG(INFO) << "to checkout output"; LOG(INFO) << "to checkout output";
ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
delete attr_key;
} }
TEST_F(TensorRTEngineTest, add_layer_multi_dim) { TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
......
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "gflags/gflags.h" #include "gflags/gflags.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tests/api/trt_test_helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h"
namespace paddle { namespace paddle {
...@@ -143,5 +144,136 @@ TEST(AnalysisPredictor, fp16) { ...@@ -143,5 +144,136 @@ TEST(AnalysisPredictor, fp16) {
#endif #endif
} }
// ernie_varlen
std::shared_ptr<paddle_infer::Predictor> InitPredictor() {
paddle_infer::Config config;
config.SetModel(FLAGS_infer_model);
config.EnableUseGpu(100, 0);
// Open the memory optim.
config.EnableMemoryOptim();
int max_batch = 32;
int max_single_seq_len = 128;
int opt_single_seq_len = 64;
int min_batch_seq_len = 1;
int max_batch_seq_len = 512;
int opt_batch_seq_len = 256;
std::string input_name0 = "read_file_0.tmp_0";
std::string input_name1 = "read_file_0.tmp_1";
std::string input_name2 = "read_file_0.tmp_2";
std::string input_name3 = "read_file_0.tmp_4";
std::vector<int> min_shape = {min_batch_seq_len};
std::vector<int> max_shape = {max_batch_seq_len};
std::vector<int> opt_shape = {opt_batch_seq_len};
// Set the input's min, max, opt shape
std::map<std::string, std::vector<int>> min_input_shape = {
{input_name0, min_shape},
{input_name1, min_shape},
{input_name2, {1}},
{input_name3, {1, 1, 1}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{input_name0, max_shape},
{input_name1, max_shape},
{input_name2, {max_batch + 1}},
{input_name3, {1, max_single_seq_len, 1}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{input_name0, opt_shape},
{input_name1, opt_shape},
{input_name2, {max_batch + 1}},
{input_name3, {1, opt_single_seq_len, 1}}};
// only kHalf supported
config.EnableTensorRtEngine(
1 << 30, 1, 5, paddle_infer::Config::Precision::kHalf, false, false);
// erinie varlen must be used with dynamic shape
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape);
// erinie varlen must be used with oss
config.EnableTensorRtOSS();
return paddle_infer::CreatePredictor(config);
}
// Feeds one fixed two-sentence batch into `predictor`, runs it, and copies the
// first output tensor into `*out_data` (resized to the tensor's element
// count).
void run(paddle_infer::Predictor* predictor, std::vector<float>* out_data) {
  constexpr int kRunBatch = 2;
  constexpr int kRunSeqLen = 71;
  constexpr int kMaxSeqLen = 128;

  const int32_t first_input[kRunSeqLen] = {
      // sentence 1
      1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4,
      134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44,
      486, 218, 1140, 279, 12043, 2,
      // sentence 2
      101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029,
      102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 9536, 2075, 1996,
      2117, 3072, 2234, 2046, 2486, 1012, 102,
  };
  const int32_t second_input[kRunSeqLen] = {
      // sentence 1
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      // sentence 2
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
      1, 1, 1, 1, 1, 1};
  // Shape info of this batch.
  const int32_t third_input[3] = {0, 40, 71};
  // kMaxSeqLen is the max sentence length over all sentences. Only the length
  // of this input matters; its contents are meaningless.
  const int32_t fourth_input[kMaxSeqLen] = {0};

  const auto input_names = predictor->GetInputNames();
  // Reshape the input at position `index` and upload `data` into it.
  const auto feed = [&](size_t index, const std::vector<int>& shape,
                        const int32_t* data) {
    auto handle = predictor->GetInputHandle(input_names[index]);
    handle->Reshape(shape);
    handle->CopyFromCpu(data);
  };
  feed(0, {kRunSeqLen}, first_input);
  feed(1, {kRunSeqLen}, second_input);
  feed(2, {kRunBatch + 1}, third_input);
  feed(3, {1, kMaxSeqLen, 1}, fourth_input);

  CHECK(predictor->Run());

  const auto output_names = predictor->GetOutputNames();
  auto output = predictor->GetOutputHandle(output_names[0]);
  const std::vector<int> dims = output->shape();
  // Element count is the product of all output dimensions.
  int element_count = 1;
  for (const int dim : dims) {
    element_count *= dim;
  }
  out_data->resize(element_count);
  output->CopyToCpu(out_data->data());
}
// Runs the ernie varlen model through the TensorRT predictor and compares the
// output against stored reference values. Only compiled when the TensorRT
// version check IS_TRT_VERSION_GE(7234) holds.
TEST(AnalysisPredictor, ernie_varlen) {
#if IS_TRT_VERSION_GE(7234)
  auto predictor = InitPredictor();
  std::vector<float> out_data;
  run(predictor.get(), &out_data);
  const std::vector<float> ref_data{0.59814,  0.219882, 0.181978,
                                    0.359796, 0.577414, 0.0627908};
  // Stop here on a size mismatch: an empty output would otherwise pass
  // vacuously and a larger one would index past the end of ref_data.
  ASSERT_EQ(out_data.size(), ref_data.size());
  const float near_tolerance = 1e-3;
  for (size_t i = 0; i < out_data.size(); i++) {
    EXPECT_NEAR(ref_data[i], out_data[i], near_tolerance);
  }
#endif
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册