diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 88ce61f9b928aba1945bddc1f9f6b785834780ca..71c4a54dea08d9d5e53f182949854981fe36a41a 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -133,7 +133,9 @@ struct Argument { // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); - DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + DECL_ARGUMENT_FIELD(static_memory_optim, StaticMemoryOptim, bool); + DECL_ARGUMENT_FIELD(static_memory_optim_force_update, + StaticMemoryOptimForceUpdate, bool); // Indicate which kind of sort algorithm is used for operators, the memory // optimization relays on the sort algorithm. DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index 57683c0b727ef1c922e3a308db28d0af4f193602..3d1be9196fdeacd8ff852dbb595473a687352ccf 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -444,6 +444,26 @@ std::vector>> DeseralizeBatchVarShapes( return batch_shapes; } +// Replace the -1 in shape to a real number to fake the shape. +std::vector>> FakeBatchVarShapes( + const framework::ProgramDesc& program) { + std::vector>> res; + res.emplace_back(); + auto& record = res.front(); + const int fake_batch_size = 3; + for (auto* var : program.Block(0).AllVars()) { + if (var->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + auto shape = var->GetShape(); + for (auto& v : shape) { + if (v < 0) v = fake_batch_size; + } + record[var->Name()].assign(shape.begin(), shape.end()); + } + } + return res; +} + // Calculate the average dim of each tensor from the batch shape cache. 
std::unordered_map GetBatchAverageSize( const std::vector>>& batches) { @@ -478,6 +498,7 @@ std::vector> AnalysisBatchShapesByBatchSize( std::unordered_map var_batchsize_hashes; for (auto& batch : batches) { for (auto& ele : batch) { + PADDLE_ENFORCE(!ele.second.empty()); int batch_size = ele.second.front(); // TODO(Superjomn) might consume large memory here, use combine hash. var_batchsize_hashes[ele.first] << batch_size; @@ -538,9 +559,21 @@ std::vector> AnalysisBatchShapesBySimilarSize( std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::pair GetRange( + const std::unordered_map& ave_size) { + auto res = std::make_pair(std::numeric_limits::max(), + std::numeric_limits::min()); + for (auto& item : ave_size) { + res.first = std::min(item.second, res.first); + res.second = std::max(item.second, res.second); + } + return res; +} + void MemoryOptimizePass::RunImpl(Argument* argument) { // When force update, should not optimize memory. - if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + if (!argument->enable_memory_optim() || + argument->static_memory_optim_force_update()) return; graph_ = argument->main_graph_ptr(); @@ -549,21 +582,38 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { argument->model_program_path_valid() ? 
argument->model_program_path() : ""); VLOG(3) << "Load memory cache from " << path; - if (inference::IsFileExists(path)) { - VLOG(4) << "Performing memory optimize"; - auto batches = DeseralizeBatchVarShapes(path); - auto var_batch_ave_size = GetBatchAverageSize(batches); + std::vector>> batches; + + if (argument->static_memory_optim() && inference::IsFileExists(path)) { + string::PrettyLogInfo("--- Performing static memory optimize"); + batches = DeseralizeBatchVarShapes(path); + } else { + string::PrettyLogInfo("--- Performing dynamic memory optimize"); + batches = FakeBatchVarShapes(argument->main_program()); + } + auto var_batch_ave_size = GetBatchAverageSize(batches); + + // Get min and max memory size. + const auto range = GetRange(var_batch_ave_size); + const int cluster_size = std::max( + static_cast((range.second - range.first) / 100 /*cluster num*/), + 1024); + const int cluster_size1 = std::max( + static_cast((range.second - range.first) / 1000 /*cluster num*/), + 1024); - std::unordered_map tensor_nodes; - space_table_t space_table; - CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); - std::unordered_map reuse_table; - double max_saving_ratio = 0.; + std::unordered_map reuse_table; + double max_saving_ratio = 0.; - std::vector> strategies; + std::vector> strategies; - for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + if (argument->static_memory_optim()) { + // This strategy only makes sense in static memory optimize. 
strategies.emplace_back([&, sort_kind] { auto clustered_vars_by_batch_size = AnalysisBatchShapesByBatchSize(batches); @@ -572,71 +622,67 @@ void MemoryOptimizePass::RunImpl(Argument* argument) { space_table, &reuse_table, sort_kind, &allocation); return allocation; }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024); // interval 1kb - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = + AnalysisBatchShapesBySimilarSize(space_table, batches, cluster_size1); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, space_table, + &reuse_table, sort_kind, &allocation); + return allocation; + }); + } - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, 1024 * 1024); // interval 1MB - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + std::function* 
best_strategy{nullptr}; - strategies.emplace_back([&, sort_kind] { - auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( - space_table, batches, - std::numeric_limits::max()); // no intervals - MemoryAllocation allocation; - MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, - space_table, &reuse_table, sort_kind, &allocation); - return allocation; - }); + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; } + } + if (!best_strategy) { + LOG(ERROR) << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); - std::function* best_strategy{nullptr}; + string::PrettyLogInfo( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); - // Try all strategies to get the best result. - for (auto& strategy : strategies) { - auto allocation = strategy(); - string::PrettyLogDetail("--- get strategy saving %f memory for workspace", - allocation.GetSavingRatio()); - if (allocation.GetSavingRatio() > max_saving_ratio) { - max_saving_ratio = allocation.GetSavingRatio(); - best_strategy = &strategy; - } - } - if (!best_strategy) { - LOG(ERROR) - << "This model makes poor memory optimize, skip memory optimize"; - return; - } - auto memory_allocation = (*best_strategy)(); - - string::PrettyLogH2( - "--- Saved %.2f%s memory for workspace(temporary variables)", - memory_allocation.GetSavingRatio() * 100, "%"); - string::PrettyLogDetail("--- Allocated %d MB", - memory_allocation.allocated / 1024. / 1024.); - string::PrettyLogDetail("--- Saved %d MB", - memory_allocation.saved / 1024. 
/ 1024.); - argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, - new std::unordered_set); - auto& vars2remove = - argument->main_graph().Get>( - framework::ir::kGraphToProgramVarsToRemove); - - PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); - argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); - } + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); } float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h index fa1ad9c8c6aeff60ec4468f41140c57be790af7f..216f416de0d1003b944337ee98fb4e6a22c66fc5 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f9da3004ed8306ef08144d096afa4f86133e492d..e6008ba335ed89222247fc00033d1afbd6b28f16 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -95,7 +95,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(enable_memory_optim_); - CP_MEMBER(memory_optim_force_update_); + CP_MEMBER(static_memory_optim_); + CP_MEMBER(static_memory_optim_force_update_); // TensorRT 
releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -238,7 +239,8 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() { ss << tensorrt_min_subgraph_size_; ss << enable_memory_optim_; - ss << memory_optim_force_update_; + ss << static_memory_optim_; + ss << static_memory_optim_force_update_; ss << use_mkldnn_; for (auto &item : mkldnn_enabled_op_types_) ss << item; @@ -278,9 +280,11 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } -void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { +void contrib::AnalysisConfig::EnableMemoryOptim( + bool static_optim, bool force_update_static_cache) { enable_memory_optim_ = true; - memory_optim_force_update_ = force_update_cache; + static_memory_optim_ = static_optim; + static_memory_optim_force_update_ = force_update_static_cache; Update(); } @@ -300,4 +304,16 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, Update(); } +NativeConfig contrib::AnalysisConfig::ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 2b0cad5faa0e31cb7546d405e05e36754915f653..9f8a78f7abc37d17b9806ea766da132f9bf4b28d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -298,15 +298,15 @@ void AnalysisPredictor::GetFetchOne(const framework::LoDTensor &fetch, bool AnalysisPredictor::GetFetch(std::vector *outputs, framework::Scope *scope) { VLOG(3) << "Predictor::get_fetch"; - outputs->resize(fetchs_.size()); - for (size_t i = 0; i < fetchs_.size(); 
++i) { - int idx = boost::get(fetchs_[i]->GetAttr("col")); + outputs->resize(fetches_.size()); + for (size_t i = 0; i < fetches_.size(); ++i) { + int idx = boost::get(fetches_[i]->GetAttr("col")); PADDLE_ENFORCE((size_t)idx == i); framework::LoDTensor &fetch = framework::GetFetchVariable(*scope, "fetch", idx); auto type = fetch.type(); auto output = &(outputs->at(i)); - output->name = fetchs_[idx]->Input("X")[0]; + output->name = fetches_[idx]->Input("X")[0]; if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; @@ -327,7 +327,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); - argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); + argument_.SetStaticMemoryOptim(config_.static_memory_optim_); + argument_.SetStaticMemoryOptimForceUpdate( + config_.static_memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -422,10 +424,10 @@ void AnalysisPredictor::PrepareFeedFetch() { feed_names_[op->Output("Out")[0]] = idx; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); - if (fetchs_.size() <= static_cast(idx)) { - fetchs_.resize(idx + 1); + if (fetches_.size() <= static_cast(idx)) { + fetches_.resize(idx + 1); } - fetchs_[idx] = op; + fetches_[idx] = op; } } } @@ -638,12 +640,12 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { // check if the cache exists if (!config_.enable_memory_optim()) { need = false; - } else if (config_.enable_memory_optim() && + } else if (config_.static_memory_optim_ && !inference::IsFileExists(inference::analysis::GetMemoryCachePath( config_.model_dir(), config_.prog_file()))) { need = true; - } else if (config_.enable_memory_optim() && - 
config_.memory_optim_force_update_) { + } else if (config_.static_memory_optim_ && + config_.static_memory_optim_force_update_) { need = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9095b6ec1af6794c19e94fc9326a48239b3ba145..a8ea67d4bd332b5614f4f6593e8397829d28c5a6 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -115,7 +115,7 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; - std::vector fetchs_; + std::vector fetches_; // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 1cee8904500636d7b49e6b4e54595dbce6a79954..f89eaeaadcc50fd7979d6807e8f2c7556e048e6c 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -162,17 +162,7 @@ struct AnalysisConfig { /** Transform the AnalysisConfig to NativeConfig. */ - NativeConfig ToNativeConfig() const { - NativeConfig config; - config.model_dir = model_dir_; - config.prog_file = prog_file_; - config.param_file = params_file_; - config.use_gpu = use_gpu_; - config.device = device_id_; - config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); - config.specify_input_name = specify_input_name_; - return config; - } + NativeConfig ToNativeConfig() const; /** Specify the operator type list to use MKLDNN acceleration. * @param op_list the operator type list. */ @@ -195,7 +185,8 @@ struct AnalysisConfig { /** Turn on memory optimize * NOTE still in development, will release latter. 
*/ - void EnableMemoryOptim(bool force_update_cache = false); + void EnableMemoryOptim(bool static_optim = false, + bool force_update_static_cache = false); /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; @@ -241,7 +232,8 @@ struct AnalysisConfig { // memory reuse related. bool enable_memory_optim_{false}; - bool memory_optim_force_update_{false}; + bool static_memory_optim_{false}; + bool static_memory_optim_force_update_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 4ec9404ab42bcd9cc0608f033cb2777106a29583..e78ab942d113323fecf5510dca85fb5db734efc8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -253,7 +253,7 @@ void compare(bool use_mkldnn = false) { } // Compare result of NativeConfig and AnalysisConfig with memory optimization. -TEST(Analyzer_dam, compare_with_memory_optim) { +TEST(Analyzer_dam, compare_with_static_memory_optim) { // The small dam will core in CI, but works in local. if (FLAGS_max_turn_num == 9) { contrib::AnalysisConfig cfg, cfg1; @@ -263,7 +263,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { SetInput(&input_slots_all); // Run the first time to force to update memory cache SetConfig(&cfg); - cfg.EnableMemoryOptim(true); + cfg.EnableMemoryOptim(true, true /*force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg), @@ -271,7 +271,7 @@ TEST(Analyzer_dam, compare_with_memory_optim) { // Run second time to use the memory cache and perform memory optimization. 
SetConfig(&cfg1); - cfg1.EnableMemoryOptim(); + cfg1.EnableMemoryOptim(true, false /*do not force update*/); CompareNativeAndAnalysis( reinterpret_cast(&cfg1), @@ -279,6 +279,24 @@ TEST(Analyzer_dam, compare_with_memory_optim) { } } +TEST(Analyzer_dam, compare_with_dynamic_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Default arguments select dynamic memory optimize; no cache update is forced. + SetConfig(&cfg); + cfg.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } #ifdef PADDLE_WITH_MKLDNN