From 67bec55cbd42df12ab4f531c4126db1ec81a07d6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 17 Jun 2021 12:54:14 +0800 Subject: [PATCH] [Inference Tensorrt] Add attr for trt engine and handle the input seq problem for ernie var len. (#33575) --- .../tensorrt/convert/emb_eltwise_layernorm.cc | 2 + .../tensorrt/convert/multihead_matmul_op.cc | 12 +- .../inference/tensorrt/convert/slice_op.cc | 13 +- paddle/fluid/inference/tensorrt/engine.h | 89 +++++++++++- .../fluid/inference/tensorrt/test_engine.cc | 11 ++ .../fluid/inference/tests/api/tester_helper.h | 1 + .../tests/api/trt_dynamic_shape_ernie_test.cc | 132 ++++++++++++++++++ 7 files changed, 253 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index 04c51202f02..18bbd1d2b77 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -36,6 +36,8 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); auto word_id_name = op_desc.Input("WordId").front(); auto pos_id_name = op_desc.Input("PosId").front(); + engine_->Set("ernie_pos_name", new std::string(pos_id_name)); + auto sent_id_name = op_desc.Input("SentId").front(); auto word_emb_name = op_desc.Input("WordEmbedding").front(); auto pos_emb_name = op_desc.Input("PosEmbedding").front(); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index f2f45c694ab..d05c9019a29 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -191,9 +191,15 @@ class MultiheadMatMulOpConverter : public OpConverter { std::vector plugin_inputs; plugin_inputs.emplace_back(fc_layer->getOutput(0)); plugin_inputs.emplace_back(mask_tensor); - 
plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 + if (engine_->Has("ernie_pos_name")) { + plugin_inputs.emplace_back( + engine_->GetITensor(engine_->Get("ernie_pos_name"))); + } else { + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network() + ->getInput(2) + ->getName())); // cu_seqlens, eval_placeholder_2 + } auto max_seqlen_tensor = engine_->GetITensor(engine_->network()->getInput(3)->getName()); auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 2ab024dff32..7f270b1f390 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -76,9 +76,16 @@ class SliceOpConverter : public OpConverter { std::vector plugin_inputs; // plugin_inputs.emplace_back(trans_layer->getOutput(0)); plugin_inputs.emplace_back(input); - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 + + std::string pos_name; + if (engine_->Has("ernie_pos_name")) { + pos_name = engine_->Get("ernie_pos_name"); + } else { + // hard code for compatibility + pos_name = engine_->network()->getInput(2)->getName(); + } + plugin_inputs.emplace_back( + engine_->GetITensor(pos_name)); // cu_seqlens, eval_placeholder_2 // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SpecialSlicePluginDynamic* plugin = diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 2358e1ef976..7e570726978 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -202,7 +202,15 @@ class TensorRTEngine { dy::initLibNvInferPlugins(&logger, ""); } - ~TensorRTEngine() {} + ~TensorRTEngine() { + for (auto& attr : attrs_) { + if (attr_dels_.find(attr.first) != 
attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } // Add an input and set its name, data type and dimension. nvinfer1::ITensor* DeclareInput(const std::string& name, @@ -386,6 +394,82 @@ class TensorRTEngine { } #endif + bool Has(const std::string& attr_name) const { + return attrs_.count(attr_name) > 0; + } + + void Erase(const std::string& attr_name) { + if (!Has(attr_name)) { + return; + } + if (attr_dels_.find(attr_name) != attr_dels_.end()) { + attr_dels_[attr_name](); + attr_dels_.erase(attr_name); + } + attrs_.erase(attr_name); + } + + // Set a pointer to the attribute. Engine takes ownership of the attribute. + template + void Set(const std::string& attr_name, AttrType* attr) { + if (attrs_.count(attr_name) != 0) { + // Overwrite: free the previously owned value so it is not leaked. + VLOG(3) << "Setting the attribute " << attr_name << " for trt engine " + << this; + auto del_iter = attr_dels_.find(attr_name); + if (del_iter != attr_dels_.end()) { + del_iter->second(); + } + } + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Engine doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string& attr_name, AttrType* attr) { + PADDLE_ENFORCE_EQ( + attrs_.count(attr_name), 0, + platform::errors::AlreadyExists( + "Attribute %s already set in trt engine.", attr_name)); + attrs_[attr_name] = attr; + } + + // Get a reference to the attribute previously set.
+ template + AttrType& Get(const std::string& attr_name) const { + PADDLE_ENFORCE_NE(attrs_.find(attr_name), attrs_.end(), + platform::errors::InvalidArgument( + "Attribute %s not found in trt engine.", attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast&) { + auto TypeToString = [](const std::type_info& info) -> std::string { + if (std::type_index(info) == std::type_index(typeid(bool*))) { + return "bool"; + } else if (std::type_index(info) == std::type_index(typeid(int*))) { + return "int"; + } else if (std::type_index(info) == + std::type_index(typeid(const int*))) { + return "const int"; + } else if (std::type_index(info) == + std::type_index(typeid(std::string*))) { + return "std::string"; + } + return info.name(); + }; + + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid type for attribute %s, expected: %s, actual: %s.", attr_name, + TypeToString(typeid(AttrType*)), + TypeToString(attrs_.at(attr_name).type()))); + } + } + private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling @@ -441,6 +525,9 @@ class TensorRTEngine { infer_ptr ihost_memory_; std::unordered_map quant_dynamic_range_; + std::unordered_map attrs_; + std::unordered_map> attr_dels_; + // For dynamic shape bool with_dynamic_shape_{false}; infer_ptr infer_networkv2_; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 7c763858bb2..5c61bec55ba 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -91,6 +91,15 @@ TEST_F(TensorRTEngineTest, add_layer) { buffers[0] = reinterpret_cast(x_v_gpu_data); buffers[1] = reinterpret_cast(y_gpu_data); + LOG(INFO) << "Set attr"; + engine_->Set("test_attr", new std::string("test_attr")); + if (engine_->Has("test_attr")) { + auto attr_val = engine_->Get("test_attr");
engine_->Erase("test_attr"); + } + std::string *attr_key = new std::string("attr_key"); + engine_->SetNotOwned("attr1", attr_key); + LOG(INFO) << "to execute"; engine_->Execute(1, &buffers, ctx_->stream()); @@ -99,6 +108,8 @@ TEST_F(TensorRTEngineTest, add_layer) { LOG(INFO) << "to checkout output"; ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); + + delete attr_key; } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 170b915ec74..dbc2acbed83 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 6d69565716e..45dff9f4c37 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "gflags/gflags.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tests/api/trt_test_helper.h" namespace paddle { @@ -143,5 +144,136 @@ TEST(AnalysisPredictor, fp16) { #endif } +// ernie_varlen +std::shared_ptr InitPredictor() { + paddle_infer::Config config; + config.SetModel(FLAGS_infer_model); + + config.EnableUseGpu(100, 0); + + // Open the memory optim. 
+ config.EnableMemoryOptim(); + + int max_batch = 32; + int max_single_seq_len = 128; + int opt_single_seq_len = 64; + int min_batch_seq_len = 1; + int max_batch_seq_len = 512; + int opt_batch_seq_len = 256; + + std::string input_name0 = "read_file_0.tmp_0"; + std::string input_name1 = "read_file_0.tmp_1"; + std::string input_name2 = "read_file_0.tmp_2"; + std::string input_name3 = "read_file_0.tmp_4"; + + std::vector min_shape = {min_batch_seq_len}; + std::vector max_shape = {max_batch_seq_len}; + std::vector opt_shape = {opt_batch_seq_len}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {input_name0, min_shape}, + {input_name1, min_shape}, + {input_name2, {1}}, + {input_name3, {1, 1, 1}}}; + std::map> max_input_shape = { + {input_name0, max_shape}, + {input_name1, max_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, max_single_seq_len, 1}}}; + std::map> opt_input_shape = { + {input_name0, opt_shape}, + {input_name1, opt_shape}, + {input_name2, {max_batch + 1}}, + {input_name3, {1, opt_single_seq_len, 1}}}; + + // only kHalf supported + config.EnableTensorRtEngine( + 1 << 30, 1, 5, paddle_infer::Config::Precision::kHalf, false, false); + // ernie varlen must be used with dynamic shape + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + // ernie varlen must be used with oss + config.EnableTensorRtOSS(); + + return paddle_infer::CreatePredictor(config); +} + +void run(paddle_infer::Predictor* predictor, std::vector* out_data) { + const int run_batch = 2; + const int run_seq_len = 71; + const int max_seq_len = 128; + + int32_t i1[run_seq_len] = { + // sentence 1 + 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, 4095, 1902, 4, + 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, 75, 201, 340, 9, 14, 44, + 486, 218, 1140, 279, 12043, 2, + // sentence 2 + 101, 2054, 2234, 2046, 2486, 2044, 1996, 2047, 4552, 2001, 9536, 1029, + 102, 2004, 1997, 2008, 2154, 1010, 1996, 2047, 4552, 
9536, 2075, 1996, + 2117, 3072, 2234, 2046, 2486, 1012, 102, + }; + int32_t i2[run_seq_len] = { + // sentence 1 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + // sentence 2 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1}; + // shape info of this batch + int32_t i3[3] = {0, 40, 71}; + // max_seq_len is the max sentence length over all the sentences. Only the + // length of input i4 is useful; + // its data means nothing. + int32_t i4[max_seq_len] = {0}; + + auto input_names = predictor->GetInputNames(); + // first input + auto input_t1 = predictor->GetInputHandle(input_names[0]); + input_t1->Reshape({run_seq_len}); + input_t1->CopyFromCpu(i1); + + // second input + auto input_t2 = predictor->GetInputHandle(input_names[1]); + input_t2->Reshape({run_seq_len}); + input_t2->CopyFromCpu(i2); + + // third input + auto input_t3 = predictor->GetInputHandle(input_names[2]); + input_t3->Reshape({run_batch + 1}); + input_t3->CopyFromCpu(i3); + + // fourth input + auto input_t4 = predictor->GetInputHandle(input_names[3]); + input_t4->Reshape({1, max_seq_len, 1}); + input_t4->CopyFromCpu(i4); + + CHECK(predictor->Run()); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputHandle(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->CopyToCpu(out_data->data()); + + return; +} + +TEST(AnalysisPredictor, ernie_varlen) { +#if IS_TRT_VERSION_GE(7234) + auto predictor = InitPredictor(); + std::vector out_data; + run(predictor.get(), &out_data); + std::vector ref_data{0.59814, 0.219882, 0.181978, + 0.359796, 0.577414, 0.0627908}; + ASSERT_EQ(out_data.size(), ref_data.size()); + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(ref_data[i], out_data[i], 1e-3); + } 
+#endif +} + } // namespace inference } // namespace paddle -- GitLab