diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index f5d240f2c621798662e7ff3053de8c75bc3708c8..77c9a94df2f2c003c743276187a8b34979491c61 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1527,6 +1527,16 @@ PDNode *patterns::AnakinDetectionPattern::operator()( ->assert_is_op_output("box_coder") ->AsIntermediate(); + auto transpose_before_nms = + pattern->NewNode(GetNodeName("transpose_before_nms")) + ->assert_is_op("transpose2"); + + auto transpose_before_nms_out = + pattern->NewNode(GetNodeName("transpose_before_nms_out")) + ->assert_is_op_output("transpose2") + ->assert_is_op_input("multiclass_nms", "Scores") + ->AsIntermediate(); + auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms")) ->assert_is_op("multiclass_nms") ->assert_op_has_n_inputs("multiclass_nms", 2); @@ -1577,8 +1587,10 @@ PDNode *patterns::AnakinDetectionPattern::operator()( {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]}); box_coder_out->LinksFrom({box_coder_op}); - multiclass_nms_op - ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]}) + transpose_before_nms->LinksFrom({conv_in[kMultiClassSecondInputNmsOffset]}); + transpose_before_nms_out->LinksFrom({transpose_before_nms}); + + multiclass_nms_op->LinksFrom({box_coder_out, transpose_before_nms_out}) .LinksTo({multiclass_nms_out}); return multiclass_nms_out; diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc index 462f077eb7a829b12ff321f7d2dc26b644d540f8..5ab10ba39fa86a9a65c247177cd437ba48d3797d 100644 --- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc @@ -45,7 +45,7 @@ std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( input_nodes.push_back(gpd.mutable_pattern() ->NewNode("x" + std::to_string(times + 1)) - ->assert_is_op_input("multiclass_nms", "Scores") + ->assert_is_op_input("transpose2") ->AsInput()); patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name); @@ -106,6 +106,11 @@ std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out")); Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]); + Node *transpose_before_nms = + subgraph.at(pattern.GetPDNode("transpose_before_nms")); + Node *transpose_before_nms_out = + subgraph.at(pattern.GetPDNode("transpose_before_nms_out")); + Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms")); Node *multiclass_nms_out = subgraph.at(pattern.GetPDNode("multiclass_nms_out")); @@ -133,11 +138,11 @@ std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( nodes[i * kNumFields + kPriorBoxLocOffset]->Name()); } - int axis = boost::get(concat_op1->Op()->GetAttr("axis")); + // int axis = boost::get(concat_op1->Op()->GetAttr("axis")); framework::OpDesc concat1_desc; concat1_desc.SetType("concat"); concat1_desc.SetInput("X", concat1_input_names); - concat1_desc.SetAttr("axis", axis); + concat1_desc.SetAttr("axis", 2); concat1_desc.SetOutput("Out", {concat_out1->Name()}); auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc); @@ -184,6 +189,8 @@ std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( delete_nodes.insert(concat_out2); delete_nodes.insert(box_coder_op); 
     delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(transpose_before_nms);
+    delete_nodes.insert(transpose_before_nms_out);
     delete_nodes.insert(multiclass_nms);

     new_add_concat_op->outputs.push_back(concat_out1);
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
index 2da280ed29b478319b5c1da34d2f85b9e1d2e20e..1646c7d16ec3d99fd82d824e9ea7d9819fa62253 100644
--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(anakin_engine SRCS engine.cc)
-nv_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
index ebe81dabcbc71f764a7487c7882a71054a76a20d..94014802bdbe1792e9eaba28d7134624dd3edc90 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -43,11 +43,13 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
+  // auto momentum = boost::get<float>(op_desc.GetAttr("momentum"));

   auto bn_op_name = op_name + ":bn";
   auto bn_output = bn_op_name + "_output";
   engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
   engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
+  engine_->AddOpAttr(bn_op_name, "momentum", static_cast<float>(1.0));

   auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
index 565a95f17b12fc36b266228b7c71fff6127fd14f..a55c153f99a815c0e0092b69b8e181630aed16bf 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -27,8 +27,8 @@ namespace paddle {
 namespace inference {
 namespace anakin {

-void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
-                                            const framework::Scope &scope,
+void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc& op,
+                                            const framework::Scope& scope,
                                             bool test_mode) {
   framework::OpDesc op_desc(op, nullptr);
   auto input_name = op_desc.Input("Input").front();
@@ -42,34 +42,45 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto fixed_ratios =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
   auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+  std::vector<float> dens;
+  for (auto& ele : densities) {
+    dens.push_back(static_cast<float>(ele));
+  }

   // lack flip
-  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  // auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
   auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+  for (auto& ele : variances) {
+    LOG(INFO) << ele;
+  }

   // lack img_h, img_w
   auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
-  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  PTuple<std::string> t_order;
+  t_order.push_back("MIN");
+  t_order.push_back("COM");
t_order.push_back("MAX"); + std::vector temp_v = {}; engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); engine_->AddOpAttr>(op_name, "min_size", temp_v); engine_->AddOpAttr>(op_name, "max_size", temp_v); engine_->AddOpAttr>(op_name, "aspect_ratio", temp_v); - engine_->AddOpAttr>(op_name, "fixed_sizes", fixed_sizes); - engine_->AddOpAttr>(op_name, "fixed_ratios", fixed_ratios); - engine_->AddOpAttr>(op_name, "density", densities); - engine_->AddOpAttr(op_name, "is_flip", false); - engine_->AddOpAttr(op_name, "is_clip", clip); + engine_->AddOpAttr>(op_name, "fixed_size", fixed_sizes); + engine_->AddOpAttr>(op_name, "fixed_ratio", fixed_ratios); + engine_->AddOpAttr>(op_name, "density", dens); + engine_->AddOpAttr(op_name, "is_flip", static_cast(false)); + engine_->AddOpAttr(op_name, "is_clip", static_cast(false)); engine_->AddOpAttr>(op_name, "variance", variances); engine_->AddOpAttr(op_name, "img_h", static_cast(0)); engine_->AddOpAttr(op_name, "img_w", static_cast(0)); engine_->AddOpAttr(op_name, "step_h", step_h); engine_->AddOpAttr(op_name, "step_w", step_w); engine_->AddOpAttr(op_name, "offset", offset); - engine_->AddOpAttr>(op_name, "order", order); + engine_->AddOpAttr>(op_name, "order", t_order); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index eb75f1f62ab92f42e8faad0e19c5a3d900e2f065..9155a7c300aade0c19f03f2afed85d00b785e3a6 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -18,6 +18,7 @@ #include #include #include +#include #include "framework/core/types.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" @@ -68,6 +69,35 @@ class AnakinOpConverter { ConvertOp(op, parameters, scope, engine); } } + + // The scope here should be inited with the parameter vars. 
+  void ConvertBlockToAnakinEngine(
+      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      const std::vector<std::string> &inputs,
+      const std::unordered_set<std::string> &parameters,
+      const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
+    framework::proto::BlockDesc *block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    engine->Freeze();
+    for (auto &input : inputs) {
+      if (parameters.count(input)) continue;
+      auto *var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+
+      auto var_shape = var->GetShape();
+      PADDLE_ENFORCE(var_shape.size() == 4);
+      std::vector<int> input_shape;
+      for (int i = 0; i < var_shape.size(); i++) {
+        input_shape.push_back(var_shape[i]);
+      }
+      input_shape[0] = 1;
+
+      engine->SetInputShape(input, input_shape);
+    }
+    engine->Optimize();
+    engine->InitGraph();
+  }
+
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
   virtual ~AnakinOpConverter() {}

diff --git a/paddle/fluid/inference/anakin/convert/pool2d.cc b/paddle/fluid/inference/anakin/convert/pool2d.cc
index 1b251149f3eccb419339a604af43c8641906e952..3dee589ac47d5c43ccb588e46563035c5a7c933a 100644
--- a/paddle/fluid/inference/anakin/convert/pool2d.cc
+++ b/paddle/fluid/inference/anakin/convert/pool2d.cc
@@ -55,7 +55,7 @@ void Pool2dOpConverter::operator()(const framework::proto::OpDesc &op,
   if (pool_type == "max") {
     anakin_pool_type = "MAX";
   } else if (pool_type == "avg") {
-    anakin_pool_type = "AVG";
+    anakin_pool_type = "AVGEXC";
   } else {
     PADDLE_THROW("TensorRT unsupported pooling type!");
   }
diff --git a/paddle/fluid/inference/anakin/convert/softmax.cc b/paddle/fluid/inference/anakin/convert/softmax.cc
index fd791ef98bc7041371439a4a72d643ca37dc29d8..d5cd8908ebf623f0334a3b4df2b19147c63f77a3 100644
--- a/paddle/fluid/inference/anakin/convert/softmax.cc
+++ b/paddle/fluid/inference/anakin/convert/softmax.cc
@@ -33,7 +33,7 @@ void SoftMaxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto output = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
   engine_->AddOp(op_name, "Softmax", {input}, {output});
-  engine_->AddOpAttr(op_name, "axis", 1);
+  engine_->AddOpAttr(op_name, "axis", 2);
 }

 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
index c6eebf5d0c425a4184fcf92c8608267a412a2272..2e438dd7241e3c865404806d41a01fc9c4c33bc7 100644
--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
@@ -52,8 +52,9 @@ TEST(batch_norm_op, test) {
   desc.SetOutput("SavedVariance", {"batch_norm_save_variance"});

   float eps = 1e-5f;
+  bool is_test = true;
   desc.SetAttr("epsilon", eps);
-  desc.SetAttr("is_test", true);
+  desc.SetAttr("is_test", is_test);

   validator.SetOp(*desc.Proto());
diff --git a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
index bb47c70a07129d53a960df7175c42f8c498da632..95cb4194957894dea1524ba8ed0a44973870f31b 100644
--- a/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_pool2d_op.cc
@@ -64,11 +64,52 @@ void test_pool2d(bool global_pooling, bool ceil_mode,
   validator.Execute(1);
 }

+void test_pool2d2(bool global_pooling, bool ceil_mode,
+                  std::string pool_type = "max") {
+  auto* pool2d_converter =
+      Registry<AnakinOpConverter>::Global().Lookup("pool2d");
+  ASSERT_TRUE(pool2d_converter);
+
+  framework::Scope scope;
+
std::unordered_set parameters; + AnakinConvertValidation validator(parameters, scope); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. + validator.DeclInputVar("pool2d_x", {1, 1, 17, 17}); + validator.DeclOutputVar("pool2d_out", {1, 1, 17, 17}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("pool2d"); + desc.SetInput("X", {"pool2d_x"}); + desc.SetOutput("Out", {"pool2d_out"}); + + std::vector ksize({3, 3}); + std::vector strides({1, 1}); + std::vector paddings({1, 1}); + std::string pooling_t = pool_type; + + desc.SetAttr("pooling_type", pooling_t); + desc.SetAttr("ksize", ksize); + desc.SetAttr("strides", strides); + desc.SetAttr("paddings", paddings); + desc.SetAttr("global_pooling", global_pooling); + desc.SetAttr("ceil_mode", true); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); } TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); } TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); } TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); } +TEST(Pool2dOpConverter, avg_ceil_test2) { test_pool2d2(false, true, "avg"); } } // namespace anakin } // namespace inference diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 815a88fb4898c636801d8a44cc27c1ee6a71091f..1b0ef8c7dbe5fd8d39c42cc39972159126ec1214 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -168,7 +168,7 @@ class AnakinConvertValidation { outputs.insert({output, tensor}); } - engine_->Execute(inputs, outputs); + engine_->Execute(inputs, outputs, stream_); int i_output = 0; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc index fb95bd03e07470973c59ee752d7f6187ad2a19db..822627b8a26fa978a46d145f6ef94d32e8636965 100644 --- a/paddle/fluid/inference/anakin/engine.cc +++ b/paddle/fluid/inference/anakin/engine.cc @@ -33,9 +33,12 @@ namespace inference { namespace anakin { template -AnakinEngine::AnakinEngine(bool need_summary) +AnakinEngine::AnakinEngine(bool need_summary, + int device) : graph_(new AnakinGraphT()), - net_(new AnakinNetT(need_summary)) {} + net_(new AnakinNetT(need_summary)) { + device_ = device; +} template AnakinEngine::~AnakinEngine() {} @@ -63,33 +66,44 @@ void AnakinEngine::AddOp( template void AnakinEngine::Execute( const std::map &inputs, - const std::map &outputs) { + const std::map &outputs, + cudaStream_t stream) { for (const auto &input : inputs) { auto *tensor = input.second; auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); + auto fluid_input_shape = framework::vectorize2int(tensor->dims()); + auto *anakin_input = net_->get_in(input.first); - auto anakin_input_shape = anakin_input->valid_shape(); - PADDLE_ENFORCE(tensor->numel(), anakin_input_shape.count(), - "the fluid input size should be equal to anakin"); + auto net_shape = anakin_input->shape(); + if (tensor->numel() > net_shape.count()) { + graph_->Reshape(input.first, fluid_input_shape); + net_.reset(new AnakinNetT(true)); + net_->init(*graph_); + anakin_input = net_->get_in(input.first); + } + + anakin_input->reshape(fluid_input_shape); 
+ net_shape = anakin_input->shape(); ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_input_shape); - anakin_input->copy_from(tmp_anakin_tensor); + net_shape); + anakin_input->share_from(tmp_anakin_tensor); } + net_->prediction(); for (const auto &output : outputs) { + platform::CUDAPlace gpu_place(device_); auto *tensor = output.second; - auto *data = tensor->data(); - auto shape = framework::vectorize2int(tensor->dims()); auto *anakin_output = net_->get_out(output.first); + auto *anakin_data = anakin_output->data(); auto anakin_output_shape = anakin_output->valid_shape(); - PADDLE_ENFORCE(tensor->numel(), anakin_output_shape.count(), - "the fluid output size should be equal to anakin"); - ::anakin::saber::Tensor tmp_anakin_tensor(data, TargetT(), 0, - anakin_output_shape); - anakin_output->share_from(tmp_anakin_tensor); + tensor->Resize(framework::make_ddim(anakin_output_shape)); + auto *fluid_data = tensor->mutable_data(gpu_place); + + memory::Copy(gpu_place, static_cast(fluid_data), gpu_place, + static_cast(anakin_data), + tensor->numel() * sizeof(float), stream); } - net_->prediction(); + cudaDeviceSynchronize(); } diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h index 408ad3b9f62f08cef0e3faa967aa1e71c2829408..2613fc7f9b50eeb973129e72fe8017a7614b4d94 100644 --- a/paddle/fluid/inference/anakin/engine.h +++ b/paddle/fluid/inference/anakin/engine.h @@ -18,6 +18,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/engine.h" @@ -26,8 +27,12 @@ #include "framework/core/net/net.h" #include "framework/core/types.h" #include "framework/graph/graph.h" +#include "framework/graph/graph_global_mem.h" #include "saber/saber_types.h" +using anakin::Precision; +using anakin::saber::NV; + namespace anakin { template @@ -50,7 +55,7 @@ class AnakinEngine { using GraphT = ::anakin::graph::Graph; public: - explicit AnakinEngine(bool need_summary = false); + explicit AnakinEngine(bool need_summary = false, int device = 0); ~AnakinEngine(); void InitGraph(); void SetInputShape(const std::string &name, std::vector shape); @@ -69,14 +74,50 @@ class AnakinEngine { void Freeze(); void Optimize(); void Save(std::string path) { graph_->save(path); } + // void SaveSerializedData(std::string& data) { graph_->save_to_string(data); + // } + // void LoadSerializedData(const std::string& data) { + // graph_->load_from_string(data); } void Execute(const std::map &inputs, - const std::map &outputs); + const std::map &outputs, + cudaStream_t stream); private: + int device_; std::unique_ptr graph_; std::unique_ptr net_; }; +class AnakinEngineManager { + using AnakinNvEngineT = AnakinEngine; + + public: + bool HasEngine(const std::string &name) const { + if (engines_.count(name) == 0) return false; + return engines_.at(name).get() != nullptr; + } + AnakinNvEngineT *Get(const std::string &name) const { + return engines_.at(name).get(); + } + + AnakinNvEngineT *Create(bool need_summary, int device, + std::string engine_name) { + std::unique_lock lk(mut_); + auto *p = new AnakinEngine(need_summary, device); + engines_[engine_name].reset(p); + return p; + } + + void DeleteALL() { + for (auto &item : engines_) { + item.second.reset(nullptr); + } + } + + private: + std::unordered_map> engines_; + std::mutex mut_; +}; } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/test_anakin_engine.cc 
b/paddle/fluid/inference/anakin/test_anakin_engine.cc index 571294d3e22fb9489686bfcb2f3a64198099f970..8fd6b8bec9ada6dec67fd24a2457713203431ebf 100644 --- a/paddle/fluid/inference/anakin/test_anakin_engine.cc +++ b/paddle/fluid/inference/anakin/test_anakin_engine.cc @@ -17,9 +17,6 @@ limitations under the License. */ #include -#include "framework/core/net/net.h" -#include "framework/graph/graph.h" -#include "framework/graph/graph_global_mem.h" #include "paddle/fluid/inference/anakin/engine.h" using anakin::graph::GraphGlobalMem; @@ -84,7 +81,9 @@ TEST_F(TestAnakinEngine, Execute) { auto *y_data = y.mutable_data(platform::CUDAPlace()); std::map outputs = {{"y", &y}}; - engine_->Execute(inputs, outputs); + cudaStream_t stream; + + engine_->Execute(inputs, outputs, stream); auto *y_data_gpu = y_data; float y_data_cpu[2]; cudaMemcpy(y_data_cpu, y_data_gpu, sizeof(float) * 2, cudaMemcpyDeviceToHost); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 321deccf86718aad013c106b5a783161f96cbcb9..43452906974e02d3a622157a124d669999bdcca6 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,6 +23,7 @@ #pragma once +#include #include #include #include @@ -55,6 +56,7 @@ struct Argument { using unique_ptr_t = std::unique_ptr>; using fusion_statis_t = std::unordered_map; + using engine_opt_info_t = std::map; bool Has(const std::string& key) const { return valid_fields_.count(key); } @@ -107,12 +109,14 @@ struct Argument { private: \ unique_ptr_t field__##_; + DECL_ARGUMENT_FIELD(predictor_id, PredictorID, int); // Model path DECL_ARGUMENT_FIELD(model_dir, ModelDir, std::string); // Model specified with program and parameters files. DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); + DECL_ARGUMENT_FIELD(engine_opt_info, EngineOptInfo, engine_opt_info_t); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -146,6 +150,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); + DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool); + // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 07423a148553d7fd3cc0f0c2568460c4228fc096..1327d69d909ffff01b3b661f9b08baa24de878bd 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include #include #include #include @@ -71,6 +72,11 @@ void IRPassManager::CreatePasses(Argument *argument, if (pass_name == "anakin_subgraph_pass") { pass->Set("program", new framework::ProgramDesc *(&argument->main_program())); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); + pass->Set("predictor_id", new int(argument->predictor_id())); } if (pass_name == "tensorrt_subgraph_pass") { @@ -95,6 +101,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("gpu_device_id", new int(argument->gpu_device_id())); pass->Set("use_static_engine", new bool(argument->tensorrt_use_static_engine())); + pass->Set("model_from_memory", new bool(argument->model_from_memory())); + pass->Set("engine_opt_info", new std::map( + argument->engine_opt_info())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 1a4c24e800bc830c9e195a800049970b9fa40634..4b21bfe6bcf2139d6d0e015111a89991277dba2a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -21,6 +21,7 @@ #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/inference/anakin/convert/op_converter.h" #include "paddle/fluid/inference/anakin/op_teller.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h" @@ -45,12 +46,20 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; - SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */); + SubGraphFuser fuser(graph.get(), teller, 0 /* min_subgraph_size */); fuser(); + std::vector graph_param_names = + ExtractAnakinParameters(graph->Nodes()); + + // those parameter already exist in anakin, and should not have another copy + // in + // fluid. 
+ std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateAnakinOp(node, graph.get()); + CreateAnakinOp(node, graph.get(), graph_param_names, &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); @@ -64,13 +73,15 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } -std::string GenerateAnakinEngineKey( - const std::set &engine_inputs, - const std::set &engine_outputs) { +std::string GenerateAnakinEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + std::string id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -78,12 +89,15 @@ std::string GenerateAnakinEngineKey( for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, - Graph *graph) const { +void AnakinSubgraphPass::CreateAnakinOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -117,10 +131,16 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); op_desc->SetInput( "Xs", std::vector(input_names.begin(), input_names.end())); @@ -231,10 +251,25 @@ void AnakinSubgraphPass::CreateAnakinOp(framework::ir::Node *node, SetAttr(op_desc->Proto(), "parameters", ExtractAnakinParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); - auto engine_key = - GenerateAnakinEngineKey(input_names_with_id, output_names_with_id); + int predictor_id = Get("predictor_id"); + auto engine_key = GenerateAnakinEngineKey( + input_names_with_id, output_names_with_id, std::to_string(predictor_id)); SetAttr(op_desc->Proto(), "engine_key", engine_key); + + auto *anakin_engine = + inference::Singleton::Global().Create( + true, Get("gpu_device_id"), engine_key); + + auto *scope = param_scope(); + std::unordered_set param_set(params.begin(), params.end()); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + + inference::Singleton::Global() + .ConvertBlockToAnakinEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, anakin_engine); } std::vector ExtractAnakinParameters( @@ -246,7 +281,7 @@ std::vector ExtractAnakinParameters( for (const auto &node : nodes) { if (!node->IsOp()) continue; std::string op_type = node->Op()->Type(); - if (op_type == "feed") { + if (op_type == "feed" || op_type == "fetch") { std::vector output_names = node->Op()->OutputArgumentNames(); std::copy(output_names.begin(), output_names.end(), 
std::back_inserter(feed_outputs)); diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index 5f45ff579d51ca3626a926e9d7e6a38e1d7225e6..a732cb55921397c18f65b9c679b6f3213a1a7190 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -15,8 +15,13 @@ #pragma once #include #include +#include +#include #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/anakin/engine.h" +using anakin::Precision; +using anakin::saber::NV; namespace paddle { namespace inference { namespace analysis { @@ -27,8 +32,9 @@ class AnakinSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateAnakinOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index d4e2da8957f2057b21460d00b71e9717c63ed054..7407883d6cc36831bbb84777e88308b79d1c1a9e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" @@ -219,7 +220,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp( SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); - SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + bool load_from_memory = Get("model_from_memory"); + std::string trt_engine_serialized_data = ""; + if (load_from_memory) { + std::map engine_opt_info = + Get>("engine_opt_info"); + if (engine_opt_info.count(engine_key)) { + trt_engine_serialized_data = engine_opt_info[engine_key]; + } + } + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); std::unique_ptr calibrator; if (enable_int8 && calibration_data.size() != 0) { @@ -230,10 +241,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // When in int8 mode and calibration_mode, the program just produce the // calibration table data. 
bool calibration_mode = (enable_int8 && calibration_data.size() == 0); - if (!calibration_mode && use_static_engine) { + if (!calibration_mode && use_static_engine && + trt_engine_serialized_data.empty()) { std::copy(params.begin(), params.end(), std::back_inserter(*repetitive_params)); - std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + trt_engine_serialized_data = GetTrtEngineSerializedData( Get("model_opt_cache_dir"), engine_key); if (trt_engine_serialized_data.empty()) { diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 29d21dc171c6d7ed756cb644675baac1d4cedd43..38313754ea98dcf4b65098d3966c131a3fc98518 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -64,8 +64,3 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI anakin_target(inference_anakin_api) anakin_target(inference_anakin_api_shared) endif() -if (WITH_ANAKIN_SUBGRAPH) - inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid) - inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid) - inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid) -endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 1be25de497346913f24eec147a2db58b0f7065f4..59e8f48313cff5bd532186348872e8d53ce11cb3 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/gpu_info.h" namespace paddle { +extern const std::vector kAnakinSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { if (!pass_builder_.get()) { @@ -230,6 +231,20 @@ void AnalysisConfig::Update() { } } + if (use_anakin_) { + PADDLE_ENFORCE(!use_tensorrt_, + "Anakin sub-graph and TensorRT sub-graph are not allowed to " + "run at the same time!"); + PADDLE_ENFORCE( + use_gpu_, + "Anakin sub-graph engine need gpu, please use the EnableGpu API."); + + pass_builder()->ClearPasses(); + for (const auto &pass : kAnakinSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -266,7 +281,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; - + ss << use_anakin_; return ss.str(); } @@ -316,6 +331,11 @@ void AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +void AnalysisConfig::SetEngineOptInfo( + std::map engine_opt_info) { + engine_opt_info_ = engine_opt_info; +} + NativeConfig AnalysisConfig::ToNativeConfig() const { NativeConfig config; config.model_dir = model_dir_; @@ -332,5 +352,8 @@ void AnalysisConfig::SwitchIrDebug(int x) { ir_debug_ = x; Update(); } - +void AnalysisConfig::EnableAnakinEngine() { + use_anakin_ = true; + Update(); +} } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index e2bb446d00c90319df89d4f54fae8eefe5bc13ea..fa6c6f500dc935f0912e6df053157259d82d6806 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -351,7 +351,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetStaticMemoryOptimForceUpdate( config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); + argument_.SetEngineOptInfo(config_.engine_opt_info_); // 
Analyze inference_program + argument_.SetUseAnakin(config_.anakin_engine_enabled()); + argument_.SetPredictorID(predictor_id_); if (!config_.model_dir().empty()) { argument_.SetModelDir(config_.model_dir()); } else { @@ -375,6 +378,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } + if (config_.use_gpu() && config_.anakin_engine_enabled()) { + LOG(INFO) << "Anakin subgraph engine is enabled"; + } + if (config_.use_mkldnn_) { LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); @@ -404,7 +411,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create AnalysisConfig"; if (config.use_gpu()) { // 1. GPU memory - PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", config.gpu_device_id()); std::vector flags; diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 5c0535d63e00c32ef82aa6d804459542d7da3e50..087bfbd002658da28e3097a10933d5be2312ab4f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -45,7 +45,9 @@ using framework::NaiveExecutor; */ class AnalysisPredictor : public PaddlePredictor { public: - explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) {} + explicit AnalysisPredictor(const AnalysisConfig &config) : config_(config) { + predictor_id_ = inference::GetUniqueId(); + } ~AnalysisPredictor(); bool Init(const std::shared_ptr &parent_scope, @@ -152,6 +154,7 @@ class AnalysisPredictor : public PaddlePredictor { const size_t max_shape_collect_count_{1000}; int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. std::vector>> batch_var_shapes_; + int predictor_id_; private: // Some status here that help to determine the status inside the predictor. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9b05c335047d7f9a0c50004e4ff6817ddd53d80f..80f256513e1538403083dce4710589dce1fd32af 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -14,9 +14,11 @@ #pragma once #include +#include #include #include #include +#include #include /*! \file */ @@ -140,6 +142,14 @@ struct AnalysisConfig { /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } + /** + * \brief Turn on the usage of Anakin sub-graph engine. + */ + void EnableAnakinEngine(); + + /** A boolean state indicating whether the Anakin sub-graph engine is used. + */ + bool anakin_engine_enabled() const { return use_anakin_; } /** \brief Control whether to debug IR graph analysis phase. * @@ -185,6 +195,7 @@ struct AnalysisConfig { /** A boolean state telling whether the model is set from the CPU memory. */ bool model_from_memory() const { return model_from_memory_; } + void SetEngineOptInfo(std::map engine_opt_info); /** Turn on memory optimize * NOTE still in development, will release latter. 
@@ -258,6 +269,8 @@ struct AnalysisConfig { std::string serialized_info_cache_; mutable std::unique_ptr pass_builder_; + bool use_anakin_{false}; + std::map engine_opt_info_; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 22c527cfc117a5e6ababf264744745e41e0bf71a..f6d82a57d2939ddd37fad682d0190668ff15e3d5 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -68,6 +68,17 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +// The following passes works for Anakin sub-graph engine. +const std::vector kAnakinSubgraphPasses({ + "infer_clean_graph_pass", // + "simplify_anakin_detection_pattern_pass3", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + "anakin_subgraph_pass", +}); + GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // @@ -120,4 +131,5 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { }); use_gpu_ = false; } +void PaddlePassBuilder::ClearPasses() { passes_.clear(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 2524d89fcd1322e105ad2217347aa2380448f2bc..c93aec1a2253a45a2d0599b7b866e20ae025a99d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -45,6 +45,7 @@ class PaddlePassBuilder { /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); + void ClearPasses(); /** Append an analysis pass. */ void AppendAnalysisPass(const std::string &pass); @@ -142,4 +143,6 @@ class GpuPassStrategy : public PassStrategy { virtual ~GpuPassStrategy() = default; }; +extern const std::vector kAnakinSubgraphPasses; + } // namespace paddle diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h index 93569367ffd08d0d72dbad2dc7561812d7727d95..7a70836652db2ce9774660d3853bde37666eed71 100644 --- a/paddle/fluid/operators/anakin/anakin_engine_op.h +++ b/paddle/fluid/operators/anakin/anakin_engine_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA +#include #include #include #include @@ -52,8 +53,9 @@ class AnakinEngineOp : public framework::OperatorBase { private: std::vector input_names_; std::unordered_set param_names_; - mutable std::unique_ptr anakin_engine_; + mutable AnakinNvEngineT *anakin_engine_; std::string engine_key_; + std::string engine_serialized_data_; public: AnakinEngineOp(const std::string &type, @@ -67,6 +69,7 @@ class AnakinEngineOp : public framework::OperatorBase { for (const auto ¶m : params) { param_names_.insert(param); } + anakin_engine_ = nullptr; } protected: @@ -77,12 +80,12 @@ class AnakinEngineOp : public framework::OperatorBase { void RunAnakin(const framework::Scope &scope, const platform::Place &dev_place) const { - if (anakin_engine_.get() == nullptr) { - anakin_engine_.reset(new AnakinEngine(true)); - Prepare(scope, dev_place, anakin_engine_.get()); - } + auto *engine = GetEngine(scope, dev_place); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto stream = + reinterpret_cast(dev_ctx).stream(); - auto *engine = anakin_engine_.get(); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = @@ -95,24 +98,48 @@ class AnakinEngineOp : public framework::OperatorBase { auto &t = inference::analysis::GetFromScope(scope, x); auto t_shape = framework::vectorize(t.dims()); + auto *anakin_input = engine->Net()->get_in(x); + auto net_shape = anakin_input->shape(); + size_t anakin_net_input_size = net_shape.count() * sizeof(float); + size_t fluid_input_size = t.memory_size(); + + if (fluid_input_size < anakin_net_input_size) { + framework::LoDTensor temp_t; + auto t_dims = t.dims(); + temp_t.Resize(t_dims); + TensorCopySync(t, dev_place, &temp_t); + t.Resize(framework::make_ddim(net_shape)); + t.mutable_data(dev_place); + TensorCopySync(temp_t, dev_place, &t); + } inputs.insert({x, &t}); } std::map outputs; int output_index = 0; for (const auto &y : Outputs("Ys")) { - std::vector ddim = - engine->Net()->get_out(output_maps[output_index])->valid_shape(); + // std::vector ddim = + // engine->Net()->get_out(output_maps[output_index])->valid_shape(); // we need get the output anakin output shape. auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); - fluid_t->mutable_data(boost::get(dev_place)); + // fluid_t->Resize(framework::make_ddim(ddim)); + // fluid_t->mutable_data(boost::get(dev_place)); outputs.insert({output_maps[output_index], fluid_t}); output_index += 1; } - engine->Execute(inputs, outputs); + engine->Execute(inputs, outputs, stream); + } + + AnakinNvEngineT *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (anakin_engine_ == nullptr) { + anakin_engine_ = + inference::Singleton::Global() + .Get(engine_key_); + } + return anakin_engine_; } void Prepare(const framework::Scope &scope, const platform::Place &dev_place, @@ -128,8 +155,6 @@ class AnakinEngineOp : public framework::OperatorBase { inference::Singleton::Global() .ConvertBlock(block_desc, param_names_, scope, engine); engine->Freeze(); - engine->Optimize(); - for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; auto &t = @@ -142,6 +167,9 @@ class AnakinEngineOp : public framework::OperatorBase { } engine->SetInputShape(x, t_shape); } + + engine->Optimize(); + engine->InitGraph(); } };
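Usage note (not part of the patch): the new AnalysisConfig::EnableAnakinEngine() added above clears the current pass list and installs kAnakinSubgraphPasses, and its Update() check requires the GPU path and rejects enabling TensorRT at the same time. The following is a minimal sketch of how a caller might turn it on; EnableAnakinEngine() comes from this patch, while SetModel(), EnableUseGpu() and CreatePaddlePredictor() are assumed from the existing inference API, and the model directory and memory-pool size are placeholders.

#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::AnalysisConfig config;
  config.SetModel("/path/to/model_dir");  // placeholder model directory
  config.EnableUseGpu(100 /* MB for the memory pool */, 0 /* GPU id */);
  config.EnableAnakinEngine();  // swaps the pass list for kAnakinSubgraphPasses

  auto predictor = paddle::CreatePaddlePredictor(config);
  // Feed inputs and fetch outputs through the usual PaddlePredictor::Run()
  // or zero-copy tensor interface.
  return 0;
}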
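A second sketch of the engine hand-off the patch sets up: at analysis time anakin_subgraph_pass creates an engine in the AnakinEngineManager singleton under the generated engine_key and fills it via ConvertBlockToAnakinEngine(); at run time AnakinEngineOp::GetEngine() looks the same instance up by that key. The header paths and the key literal below are assumptions for illustration; real keys come from GenerateAnakinEngineKey().

#include "paddle/fluid/inference/anakin/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

void EngineRegistryFlowSketch() {
  using paddle::inference::Singleton;
  using paddle::inference::anakin::AnakinEngineManager;

  // Build side (anakin_subgraph_pass): create the engine once under a key.
  auto *engine = Singleton<AnakinEngineManager>::Global().Create(
      true /* need_summary */, 0 /* device */, "engine_key_placeholder");
  (void)engine;  // ConvertBlockToAnakinEngine(...) would populate it here.

  // Run side (AnakinEngineOp::GetEngine): fetch the same instance by key.
  auto *same_engine =
      Singleton<AnakinEngineManager>::Global().Get("engine_key_placeholder");
  (void)same_engine;
}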