diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 9155a7c300aade0c19f03f2afed85d00b785e3a6..6ce37c39e6c02acfcfcf6b6566e8ecca121041fd 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -90,10 +90,12 @@ class AnakinOpConverter {
       for (int i = 0; i < var_shape.size(); i++) {
         input_shape.push_back(var_shape[i]);
       }
-      input_shape[0] = 1;
+      input_shape[0] = engine->GetMaxBatch();
       engine->SetInputShape(input, input_shape);
     }
+
+    // engine->Graph()->RegistAllOut();
     engine->Optimize();
     engine->InitGraph();
   }
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
index 822627b8a26fa978a46d145f6ef94d32e8636965..b8b0d06d2106010772b0b9d4d307fd2744ce00a2 100644
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -34,10 +34,12 @@ namespace anakin {
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
 AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
-                                                            int device)
+                                                            int device,
+                                                            int max_batch_size)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
       net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
   device_ = device;
+  max_batch_size_ = max_batch_size;
 }
@@ -71,8 +73,8 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
   for (const auto &input : inputs) {
     auto *tensor = input.second;
     auto *data = tensor->data<float>();
-    auto fluid_input_shape = framework::vectorize2int(tensor->dims());
+    auto fluid_input_shape = framework::vectorize2int(tensor->dims());
     auto *anakin_input = net_->get_in(input.first);
     auto net_shape = anakin_input->shape();
     if (tensor->numel() > net_shape.count()) {
@@ -84,11 +86,13 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     anakin_input->reshape(fluid_input_shape);
     net_shape = anakin_input->shape();
+
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       net_shape);
-    anakin_input->share_from(tmp_anakin_tensor);
+                                                       // net_shape);
+                                                       fluid_input_shape);
+    anakin_input->copy_from(tmp_anakin_tensor);
   }
-
+  cudaDeviceSynchronize();
   net_->prediction();
   for (const auto &output : outputs) {
     platform::CUDAPlace gpu_place(device_);
@@ -98,12 +102,10 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto anakin_output_shape = anakin_output->valid_shape();
     tensor->Resize(framework::make_ddim(anakin_output_shape));
     auto *fluid_data = tensor->mutable_data<float>(gpu_place);
-
     memory::Copy(gpu_place, static_cast<void *>(fluid_data), gpu_place,
                  static_cast<void *>(anakin_data),
                  tensor->numel() * sizeof(float), stream);
   }
-  cudaDeviceSynchronize();
 }
diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h
index 2613fc7f9b50eeb973129e72fe8017a7614b4d94..101ca491678a54ce09fd9a5aa81d63eaede46304 100644
--- a/paddle/fluid/inference/anakin/engine.h
+++ b/paddle/fluid/inference/anakin/engine.h
@@ -55,7 +55,8 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;

  public:
-  explicit AnakinEngine(bool need_summary = false, int device = 0);
+  explicit AnakinEngine(bool need_summary = false, int device = 0,
+                        int max_batch_size = 1);
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -70,10 +71,12 @@ class AnakinEngine {
                    "Add operation's attribution.");
   }
   NetT *Net() { return net_.get(); }
+  GraphT *Graph() { return graph_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
+  int GetMaxBatch() { return max_batch_size_; }
   // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
   // }
   // void LoadSerializedData(const std::string& data) {
@@ -83,6 +86,7 @@ class AnakinEngine {
                cudaStream_t stream);

  private:
+  int max_batch_size_;
   int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
@@ -100,10 +104,11 @@ class AnakinEngineManager {
     return engines_.at(name).get();
   }

-  AnakinNvEngineT *Create(bool need_summary, int device,
+  AnakinNvEngineT *Create(bool need_summary, int device, int max_batch_size,
                           std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device,
+                                                    max_batch_size);
     engines_[engine_name].reset(p);
     return p;
   }
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 43452906974e02d3a622157a124d669999bdcca6..87aceba4793265189b5b35e76443b5ca1a6809aa 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -150,6 +150,7 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
+  DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);

   // Memory optimized related.
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 1327d69d909ffff01b3b661f9b08baa24de878bd..3dc9c347b5f0bf4340f056f350e1ab38f5160a28 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -77,6 +77,7 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
     }

     if (pass_name == "tensorrt_subgraph_pass") {
@@ -91,16 +92,20 @@ void IRPassManager::CreatePasses(Argument *argument,
           AnalysisConfig::Precision::kInt8;
       pass->Set("enable_int8", new bool(enable_int8));
-      std::string model_opt_cache_dir =
-          argument->Has("model_dir")
-              ? argument->model_dir()
-              : GetDirRoot(argument->model_program_path());
-      pass->Set(
-          "model_opt_cache_dir",
-          new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+
+      bool use_static_engine = argument->tensorrt_use_static_engine();
+      bool model_from_memory = argument->model_from_memory();
+      if ((!model_from_memory && use_static_engine)) {
+        std::string model_opt_cache_dir =
+            argument->Has("model_dir")
+                ? argument->model_dir()
+                : GetDirRoot(argument->model_program_path());
+        pass->Set(
+            "model_opt_cache_dir",
+            new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      }
       pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
-      pass->Set("use_static_engine",
-                new bool(argument->tensorrt_use_static_engine()));
+      pass->Set("use_static_engine", new bool(use_static_engine));
       pass->Set("model_from_memory", new bool(argument->model_from_memory()));
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index 4b21bfe6bcf2139d6d0e015111a89991277dba2a..b2bd1ec0ea1143c971d4e9ace784e645c96926bf 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -256,10 +256,11 @@ void AnakinSubgraphPass::CreateAnakinOp(
       input_names_with_id, output_names_with_id, std::to_string(predictor_id));
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  int max_batch_size = Get<int>("max_batch_size");
   auto *anakin_engine =
       inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), engine_key);
+          true, Get<int>("gpu_device_id"), max_batch_size, engine_key);

   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 7407883d6cc36831bbb84777e88308b79d1c1a9e..1800f06f2de2ac7f8bd6b10b4c079ec75f13b67a 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -245,8 +245,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       trt_engine_serialized_data.empty()) {
     std::copy(params.begin(), params.end(),
               std::back_inserter(*repetitive_params));
-    trt_engine_serialized_data = GetTrtEngineSerializedData(
-        Get<std::string>("model_opt_cache_dir"), engine_key);
+
+    if (use_static_engine && !load_from_memory) {
+      trt_engine_serialized_data = GetTrtEngineSerializedData(
+          Get<std::string>("model_opt_cache_dir"), engine_key);
+    }

     if (trt_engine_serialized_data.empty()) {
       LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
@@ -267,10 +270,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
       trt_engine_serialized_data =
           std::string((const char *)serialized_engine_data->data(),
                       serialized_engine_data->size());
-      SaveTrtEngineSerializedDataToFile(
-          GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
-                                     engine_key),
-          trt_engine_serialized_data);
+
+      if (use_static_engine && !load_from_memory) {
+        SaveTrtEngineSerializedDataToFile(
+            GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                       engine_key),
+            trt_engine_serialized_data);
+      }
     } else {
       LOG(INFO) << "Load TRT Optimized Info from "
                 << GetTrtEngineSerializedPath(
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 59e8f48313cff5bd532186348872e8d53ce11cb3..3c17f49fa350929e4c92c470c62a2dab6b6a92da 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -109,6 +109,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);

+  CP_MEMBER(use_anakin_);
+  CP_MEMBER(anakin_max_batchsize_);
+
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
   CP_MEMBER(use_feed_fetch_ops_);
@@ -352,7 +355,8 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
-void AnalysisConfig::EnableAnakinEngine() {
+void AnalysisConfig::EnableAnakinEngine(int max_batch_size) {
+  anakin_max_batchsize_ = max_batch_size;
   use_anakin_ = true;
   Update();
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index fa6c6f500dc935f0912e6df053157259d82d6806..9c992602e0a82d816a69b369f4de6d4370896a33 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -379,6 +379,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
   }

   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
+    argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 80f256513e1538403083dce4710589dce1fd32af..65dd669c95fc50d08af04a1a48fcf44f111373d3 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -145,7 +145,7 @@ struct AnalysisConfig {
   /**
    * \brief Turn on the usage of Anakin sub-graph engine.
    */
-  void EnableAnakinEngine();
+  void EnableAnakinEngine(int max_batch_size = 1);

   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -270,6 +270,7 @@ struct AnalysisConfig {
   mutable std::unique_ptr<PassStrategy> pass_builder_;

   bool use_anakin_{false};
+  int anakin_max_batchsize_;
   std::map<std::string, std::string> engine_opt_info_;
 };
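
Usage note (not part of the patch): a minimal sketch of how the new
EnableAnakinEngine(max_batch_size) parameter would be consumed from the C++
inference API. The model directory and the batch size below are placeholders,
and the surrounding calls (SetModel, EnableUseGpu, CreatePaddlePredictor) are
the pre-existing AnalysisConfig surface, not something added by this change.

    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      // Placeholder model directory; point this at a real saved model.
      paddle::AnalysisConfig config;
      config.SetModel("./mobilenet_v2");
      config.EnableUseGpu(100 /* initial GPU memory pool, MB */, 0 /* device id */);
      // New in this patch: size the Anakin sub-graph engine for batches up to 4.
      // The converter now fixes input_shape[0] to engine->GetMaxBatch(), so
      // inputs fed at run time must not exceed this batch dimension.
      config.EnableAnakinEngine(4 /* max_batch_size */);

      auto predictor = paddle::CreatePaddlePredictor(config);
      // ... prepare PaddleTensor inputs with batch <= 4 and call predictor->Run().
      return 0;
    }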