diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 26eac939054c1e8bf68e7d9cc16a54dde797d854..12b31da010c34a1e87a0ee449ca1cca2c33f113e 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include #include +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d99f856d8f46ea760ce07533446ce3bec95d7d27..8de93cf285e4bf34c2d2bf425fa5f3459704b3d6 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,8 +18,10 @@ limitations under the License. */ #include #include #include +#include #include #include +#include "paddle/fluid/framework/ir/graph_traits.h" DEFINE_string(print_sub_graph_dir, "", "FLAGS_print_sub_graph_dir is used " @@ -41,7 +43,7 @@ void SortHelper( } } - VLOG(3) << "topology sort insert: " << node->Name() + VLOG(5) << "topology sort insert: " << node->Name() << " " << reinterpret_cast(node) << " input " << node->inputs.size(); ret->push_back(node); } @@ -99,12 +101,13 @@ std::vector TopologySortOperations(const Graph &graph) { return ret; } +// Build operator inlink edge table. std::map> BuildOperationAdjList( const Graph &graph) { std::map> adj_list; for (auto &n : graph.Nodes()) { - if (n->NodeType() != ir::Node::Type::kOperation) continue; + if (!n->IsOp()) continue; if (adj_list.find(n) == adj_list.end()) { adj_list[n] = std::unordered_set(); } @@ -121,6 +124,119 @@ std::map> BuildOperationAdjList( return adj_list; } +// Build operator outlink edge table. +std::map> BuildOperationOutAdjList( + const Graph &graph) { + std::map> adj_list; + + for (auto &n : graph.Nodes()) { + if (!n->IsOp()) continue; + if (adj_list.find(n) == adj_list.end()) { + adj_list[n] = std::unordered_set(); + } + for (auto &var : n->outputs) { + for (auto &adj_n : var->outputs) { + PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); + VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) + << " -> " << n->Name() << reinterpret_cast(n) + << " via " << var->Name() << reinterpret_cast(var); + adj_list[n].insert(adj_n); + } + } + } + return adj_list; +} + +std::vector OpDFSSort(const Graph &graph) { + auto edge_table = BuildOperationOutAdjList(graph); + std::stack stack; + for (auto &ele : edge_table) { + if (ele.first->inputs.empty()) { + // find the input ops (those without input vars) + stack.push(ele.first); + } else { + // find the ops with only persistable vars as inputs. + bool all_persistable = true; + for (auto *input : ele.first->inputs) { + if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) { + all_persistable = false; + } + } + if (all_persistable) { + stack.push(ele.first); + } + } + } + + std::vector res; + // start from the feed op and DFS + std::unordered_set unique_set; + while (!stack.empty()) { + // will start from the last feed by default. + auto cur = stack.top(); + stack.pop(); + unique_set.insert(cur); + res.push_back(cur); + + for (auto *op : edge_table[cur]) { + if (!unique_set.count(op)) { + stack.push(op); + } + } + } + return res; +} + +std::vector TopologyDfsSortOperations(const Graph &graph) { + std::vector nodes; + std::unordered_map in_degree; + + auto set_out_ops_ready = [&](Node *var) { + for (auto *op : var->outputs) { + --in_degree[op]; + } + }; + // build in_degree + for (auto *node : graph.Nodes()) { + if (node->IsOp()) { + in_degree[node] += node->inputs.size(); + } else if (node->IsVar() && node->inputs.empty()) { + // put all the inputs of the whole graph ready. + set_out_ops_ready(node); + } + } + + std::deque op_queue; + // first visit + for (auto &node : OpDFSSort(graph)) { + if (node->IsOp()) { + op_queue.push_back(node); + } + } + + // traverse the graph + int num_ops = op_queue.size(); + while (num_ops) { + for (auto it = op_queue.begin(); it != op_queue.end(); it++) { + auto *&cur_op = *it; + if (!cur_op || in_degree[cur_op] > 0) continue; + // visit this node + // put all the output var of this op valid. + for (auto *out_var : cur_op->outputs) { + if (!out_var) continue; + set_out_ops_ready(out_var); + } + VLOG(8) << "visit " << cur_op->Name(); + nodes.push_back(cur_op); + + cur_op = nullptr; + num_ops--; + } + } + + return nodes; +} + size_t GraphNum(const Graph &graph) { std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; @@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) { return graph_count; } +void CleanIndividualNodes(Graph *graph) { + std::unordered_set nodes2rm; + for (auto *node : graph->Nodes()) { + if (node->inputs.empty() && node->outputs.empty()) { + nodes2rm.insert(node); + } + } + + for (auto *node : nodes2rm) { + graph->RemoveNode(node); + } +} + +std::vector TopologyVarientSort(const Graph &graph, + SortKind sort_kind) { + switch (sort_kind) { + case SortKind::TS: + return framework::ir::TopologySortOperations(graph); + default: + return framework::ir::TopologyDfsSortOperations(graph); + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index be525151f9f9749b913a7e5111e5622d868bd266..fba4936f2c5c971f6c63a452ec4480ff091db25c 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph); // `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); +// Topological sort, but try to DFS. +std::vector TopologyDfsSortOperations(const Graph &graph); + +// Different kinds to sort the operators in a graph to a sequence. +enum class SortKind { + // Topological Search + TS = 0, + // Topological and Depth First Search + TDFS +}; + +// Several kinds of topological sort. +std::vector TopologyVarientSort(const Graph &graph, SortKind sort_kind); + +// Clean the nodes that doesn't connect to others. +void CleanIndividualNodes(Graph *graph); + // Build an adjacency list of operations for the `graph`. std::map> BuildOperationAdjList( const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 36f36933265c69fcd450894a3e32bbb3e547b62c..3372dcd181d32d9d36eb590c9a4688d1f4c9357b 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" - #include "paddle/fluid/framework/program_desc.h" namespace paddle { @@ -29,6 +28,14 @@ namespace ir { std::unique_ptr GraphToProgramPass::ApplyImpl( std::unique_ptr graph) const { + // Remove the unneeded variables after memory optimization. + std::unordered_set vars2remove; + if (graph->Has(kGraphToProgramVarsToRemove)) { + vars2remove = graph->Get>( + kGraphToProgramVarsToRemove); + VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes"; + } + ProgramDesc& program = Get("program"); std::unique_ptr program_pb( @@ -40,25 +47,35 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( std::unordered_set visited_vars; for (ir::Node* n : graph->Nodes()) { if (n->IsVar()) { - if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) { + if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 && + !vars2remove.count(n->Var()->Name())) { visited_vars.insert(n->Var()->Name()); block->add_vars()->MergeFrom(*n->Var()->Proto()); } } } - block->clear_ops(); - std::vector nodes = TopologySortOperations(*graph); + + std::vector nodes; + if (Has(kGraphToProgramSortKind)) { + // Inference Memory Optimize relays on this branch. + int sort_kind = Get(kGraphToProgramSortKind); + nodes = TopologyVarientSort( + *graph, static_cast(sort_kind)); + } else { + nodes = TopologySortOperations(*graph); + } + for (ir::Node* n : nodes) { - if (!n->Op()) { - continue; - } + if (!n->Op()) continue; + block->add_ops()->MergeFrom(*n->Op()->Proto()); } program.CopyFrom(*program_pb); return graph; } + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 124ec5a8e771fb768b31fa2e9f5143db96154490..4c36c3a5da13aa9414a55604eb953302e738f014 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -20,6 +20,10 @@ namespace paddle { namespace framework { namespace ir { +const char kGraphToProgramVarsToRemove[] = + "__graph_to_program_vars_to_remove__"; +const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; + class GraphToProgramPass : public Pass { protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 31ed98db72c8fd4af8c970861d386687962001ce..87a28a2a66c93db763a148801876eb2fb4c61f66 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); \ No newline at end of file diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 89dcc677b57eba356c0b6af857f9f8ff6273a683..9eade9eaa8f00fe6e76063344f47968f4e97cf7f 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -64,7 +64,7 @@ class Node { std::string Name() const { return name_; } - VarDesc* Var() { + VarDesc* Var() const { PADDLE_ENFORCE(IsVar()); return var_desc_.get(); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 86e6b1f7d92bc7bc97180e05f6a7c14ab375f92f..a37bb6f4da1fc3baffad36c13c690c6410ac4270 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -50,8 +50,8 @@ void NaiveExecutor::Run() { "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { - VLOG(3) << std::this_thread::get_id() << " run " << op->Type() - << " on scope " << scope_; + VLOG(4) << std::this_thread::get_id() << " run " + << op->DebugStringEx(scope_) << " on scope " << scope_; op->SetIsCalledByExecutor(false); op->Run(*scope_, place_); } @@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, anc = anc->parent(); } + int num_vars = 0; for (auto &var : global_block.AllVars()) { if (var->Name() == framework::kEmptyVarName) { continue; } + num_vars++; if (persistable == var->Persistable()) { if (persistable) { @@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id, } } } + VLOG(4) << "naive executor create " << num_vars << " vars"; } void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id, diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 27b6b80955e45446cd9ea6c8edf29a3173f0263b..7a795bda820dc34f78f33191558fd6c0ccd2cb24 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -18,6 +18,7 @@ cc_library(analysis SRCS analyzer.cc analysis_pass DEPS ${analysis_deps} analysis_helper + ${INFER_IR_PASSES} ) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index c8ed373ee7c32552608d501aa642677f940cd520..d82a063d8808591a7ebf6b70e7421a401ce969f7 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -15,8 +15,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include #include -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" #include "paddle/fluid/inference/analysis/passes/passes.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -24,13 +24,16 @@ namespace analysis { Analyzer::Analyzer() {} -void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); } +void Analyzer::Run(Argument *argument) { RunAnalysis(argument); } -void Analyzer::RunIrAnalysis(Argument *argument) { - std::vector passes({"ir_analysis_compose_pass"}); - - for (auto &pass : passes) { - PassRegistry::Global().Retreive(pass)->Run(argument); +void Analyzer::RunAnalysis(Argument *argument) { + PADDLE_ENFORCE(argument->analysis_passes_valid(), + "analsis_passes is not valid in the argument."); + for (auto &pass : argument->analysis_passes()) { + string::PrettyLogH1("--- Running analysis [%s]", pass); + auto *ptr = PassRegistry::Global().Retreive(pass); + PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass); + ptr->Run(argument); } } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index b43e67f20f493cd8151871ca3a36eb6fdadcf9ff..a6de18db60072fc2a5310893a885994d675dd8b6 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -54,7 +54,7 @@ class Analyzer final { DISABLE_COPY_AND_ASSIGN(Analyzer); protected: - void RunIrAnalysis(Argument* argument); + void RunAnalysis(Argument* argument); }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 4c84d02d8679c4d42c0d02ae83e7f869c0f5ce8b..c814ce454840a2c6f3829599b86c9e127d07e4f4 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); @@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) { argument.SetModelDir(FLAGS_inference_model_dir); argument.SetIrAnalysisPasses({"infer_clean_graph_pass"}); argument.SetUseGPU(false); + argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}); Analyzer analyser; analyser.Run(&argument); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2d8980b1d15d89cdf9c243a57188a0acb354940d..88ce61f9b928aba1945bddc1f9f6b785834780ca 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -110,16 +110,20 @@ struct Argument { // The overall Scope to work on. DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope); + // The default program, loaded from disk. DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc); // The ir passes to perform in analysis phase. DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, std::vector); + DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses, + std::vector); // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); @@ -127,6 +131,13 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); + // Memory optimized related. + DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); + DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool); + // Indicate which kind of sort algorithm is used for operators, the memory + // optimization relays on the sort algorithm. + DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int); + // The program transformed by IR analysis phase. DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram, framework::proto::ProgramDesc); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 269a0da9f9378601373e42d741f519843b111ec6..de04713b531dc421b885473cc8956e8ba6b63574 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -28,6 +28,13 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/port.h" +#ifdef _WIN32 +#define GCC_ATTRIBUTE(attr__) ; +#else +#define GCC_ATTRIBUTE(attr__) __attribute__((attr__)); +#endif +#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result) + namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e37fea38bcb2b1f514347ecbfe7072abb6f07455..4e1464226450b833e6d8dae2be2dcad89dd1e5e4 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -83,6 +83,7 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { PADDLE_ENFORCE(graph.get()); // Apply all the passes for (const auto &pass : passes_) { + if (pass->Type() == "graph_viz_pass") continue; PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); graph = pass->Apply(std::move(graph)); } diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 9ae5b8aa173b85904df360eb196aefe5af08c6aa..eb6e1768a2c01f1388962eefe8e70368cae8cf8b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -if (TENSORRT_FOUND) +if (WITH_GPU AND TENSORRT_FOUND) cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) set(analysis_deps ${analysis_deps} diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc index b6a5dfd087c95d0ccb0f5adfa4f754cfa5a44f14..a64f85ee9ac1a7bb8f0ed7bb8678166bbbcd5746 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_detector.cc @@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() { auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)(); for (auto &subgraph : subgraphs) { if (subgraph.size() <= (size_t)min_subgraph_size_) continue; - LOG(INFO) << "detect a subgraph size " << subgraph.size(); std::unordered_set subgraph_uniq(subgraph.begin(), subgraph.end()); // replace this sub-graph with the first node. Two steps: 1. Create a Block // Node that contains this subgraph 2. Mark the nodes inside the sub-graph diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index bc06e78ae6997b0d4d0456c15d6e4158efdad300..5f25303cc1eaa6b563f0f8f4289b38499eb487cc 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/string/pretty_log.h" namespace paddle { namespace inference { @@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, framework::BlockDesc block_desc(nullptr, &block_proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); + string::PrettyLogDetail("--- detect a sub-graph with %d nodes", + subgraph.size()); + for (auto *node : subgraph) { auto *op = block_desc.AppendOp(); *op->Proto() = *node->Op()->Proto(); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index add9b70f2cd960a94232b35edb928ab4115cbff0..691c336ebe4b6a6cb60023859b21665b6a4756a8 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -1,11 +1,18 @@ cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager) cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager) +cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass) cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager) -cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass) +cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass) + +cc_library(analysis_passes SRCS passes.cc DEPS + ir_graph_build_pass + ir_analysis_pass + ir_params_sync_among_devices_pass + memory_optim_pass + ir_graph_to_program_pass +) set(analysis_deps ${analysis_deps} - ir_graph_build_pass - ir_analysis_pass analysis_passes subgraph_detector CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc deleted file mode 100644 index 490189e550760b4de62724e685dd07f6e521445e..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h" -#include -#include -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/inference/analysis/ir_pass_manager.h" -#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" -#include "paddle/fluid/string/pretty_log.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void IrAnalysisComposePass::RunImpl(Argument *argument) { - ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - ApplyIrPasses(argument); - CollectFusionStatis(argument); -} - -std::string IrAnalysisComposePass::repr() const { - return "ir-analysis-compose-pass"; -} - -void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { - std::vector passes({ - "ir_graph_build_pass", "ir_analysis_pass", - "ir_params_sync_among_devices_pass", - }); - for (const auto &pass : passes) { - VLOG(2) << "Run pass " << pass; - auto *the_pass = PassRegistry::Global().Retreive(pass); - the_pass->Run(argument); - } -} - -void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) { - if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { - LOG(INFO) << "argument has no fuse statis"; - return; - } - argument->SetFusionStatis( - argument->main_graph().Get( - framework::ir::kFuseStatisAttr)); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index e327bd39f0ae0b8fbe3b189e4bb26a23c44d910c..d986811a827b6ed477b30bc43d26f52a71e8f178 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/inference/analysis/ir_pass_manager.h" namespace paddle { @@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) { IRPassManager the_ir_manager(argument); graph = the_ir_manager.Apply(std::move(graph)); PADDLE_ENFORCE_GT(graph->Nodes().size(), 0); - argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc( - the_ir_manager.AcquireProgram(&graph, argument->main_program()))); argument->SetMainGraph(graph.release()); + CollectFusionStatis(argument); +} + +void IrAnalysisPass::CollectFusionStatis(Argument* argument) { + if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) { + LOG(INFO) << "argument has no fuse statis"; + return; + } + argument->SetFusionStatis( + argument->main_graph().Get( + framework::ir::kFuseStatisAttr)); } std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h index d8a7449807585257c153d3c8958555ea2306afa3..2c2113c06d917b5473e68935889e4a7b16b6cfc1 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.h @@ -29,6 +29,9 @@ namespace analysis { class IrAnalysisPass : public AnalysisPass { public: void RunImpl(Argument* argument) override; + + void CollectFusionStatis(Argument* argument); + std::string repr() const override; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f1da37af3cc5fa55eb66a1822aefe96eda1dc4fb --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void IrGraphToProgramPass::RunImpl(Argument *argument) { + auto pass = + framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); + + if (argument->memory_optim_sort_kind_valid()) { + pass->Set(framework::ir::kGraphToProgramSortKind, + new int(argument->memory_optim_sort_kind())); + } + + std::unique_ptr graph(argument->main_graph_ptr()); + framework::ProgramDesc desc(argument->main_program()); + pass->SetNotOwned("program", &desc); + auto thegraph = pass->Apply(std::move(graph)); + thegraph.release(); // the argument still own the graph. + + argument->SetIrAnalyzedProgram( + new framework::proto::ProgramDesc(*desc.Proto())); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h similarity index 59% rename from paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h rename to paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 16c6b7d84df88d0ebbc06b547c75a45dcb0c2440..838ebdbc9d71eb3a73882e4c4c3e4bf6078150e4 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -14,31 +14,17 @@ #pragma once -#include -#include #include "paddle/fluid/inference/analysis/analysis_pass.h" -#include "paddle/fluid/inference/analysis/passes/passes.h" namespace paddle { namespace inference { namespace analysis { -/* - * The analysis pass to run a list of IR passes (like a function call). - * Currently, it should be the first pass of analysis phase. - */ -class IrAnalysisComposePass : public AnalysisPass { +class IrGraphToProgramPass : public AnalysisPass { public: - void RunImpl(Argument* argument) override; - std::string repr() const override; + void RunImpl(Argument *argument) override; - private: - void ApplyIrPasses(Argument* argument); - - void CollectFusionStatis(Argument* argument); - - // Assign a Scope for IR passes to modify the weights. - void AssignScopeToModify(Argument* argument); + std::string repr() const override { return "ir-graph-to-param-pass"; } }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..57683c0b727ef1c922e3a308db28d0af4f193602 --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -0,0 +1,647 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include "paddle/fluid/framework/ir/graph_traits.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { +namespace analysis { + +using framework::ir::Graph; +using framework::ir::Node; +using framework::ir::TopologyVarientSort; +using space_table_t = MemoryOptimizePass::space_table_t; + +// Collect the lifecycles of the tensors. +// Traverse the graph in topological order. +// The traversal order also affect the lifecycles, so different sort_kind is +// used. +void MemoryOptimizePass::CollectLifeCycle( + std::unordered_map* lifecycles, + int sort_kind) const { + max_lifecycle_ = 0; + for (auto* op_node : framework::ir::TopologyVarientSort( + *graph_, static_cast(sort_kind))) { + if (!op_node->IsOp()) continue; + auto reads = op_node->inputs; + auto writes = op_node->outputs; + + std::vector requires(reads.begin(), reads.end()); + requires.insert(requires.end(), writes.begin(), writes.end()); + + // Disable reuse of feed variables. + if (op_node->Name() == "feed") { + for (auto* node : op_node->outputs) { + auto var = node->Name(); + lifecycles->emplace(var, + std::make_pair(0, std::numeric_limits::max())); + } + } else { + // Normal operators. + for (const Node* node : requires) { + if (node->Var()->Persistable()) continue; + std::string var = node->Name(); + if (!lifecycles->count(var)) { + (*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_); + } else { + (*lifecycles)[var].second = + std::max(max_lifecycle_, lifecycles->at(var).second); // max() + } + } + } + + ++max_lifecycle_; + } +} + +// TODO(Superjomn) Make this a general help method. +int DataTypeToSpace(framework::proto::VarType_Type type) { + switch (type) { + case framework::proto::VarType_Type_BOOL: + return sizeof(bool); + case framework::proto::VarType_Type_FP32: + return sizeof(float); + case framework::proto::VarType_Type_INT32: + return sizeof(int32_t); + case framework::proto::VarType_Type_INT64: + return sizeof(int64_t); + default: + PADDLE_THROW("Unknown data type"); + } +} + +// Collect the memory size of the tensors. +void MemoryOptimizePass::CollectVarMemorySize( + const std::unordered_map& batch_var_ave_dim, + std::unordered_map* tensor_nodes, + space_table_t* space_table) const { + // Collect tensors from graph. + for (auto* node : graph_->Nodes()) { + if (node->IsVar() && + node->Var()->GetType() == + framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { + // Parameters will not be reused. + if (node->Var()->Persistable()) continue; + (*tensor_nodes)[node->Name()] = node; + (*space_table)[node->Name()] = + DataTypeToSpace(node->Var()->GetDataType()) * + batch_var_ave_dim.at(node->Name()); + } + } +} + +// Find a sutable (big enough but smallest to avoid memory waste). +// +// Args: +// @tensor_nodes: the tensor nodes in the ir::Graph. +// @free_existing_tensors: the allocated tensor and are free. +// @space_table: the memory space of tensors. +// @tensor2use: the tensor that requires memory. +// +// Returns: +// true if found some existing tensor to reuse. +// false if no sutable tensor to reuse, one need to allocate a new tensor for +// this requirement. +// The suitable tensor for reuse is one that is approximately equal to the +// memory demand. +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) __SHOULD_USE_RESULT__; + +bool FindSuitableTensorToReuse( + const std::string& tensor, int space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + const space_table_t& space_table, + const std::vector>& var_clusters, + std::string* tensor2use) { + std::pair best_fit; + best_fit.second = std::numeric_limits::max(); + VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters"; + + // find the cluster this var belongs to. + const std::unordered_set* cluster = nullptr; + for (const auto& c : var_clusters) { + if (c.count(tensor)) { + cluster = &c; + break; + } + } + PADDLE_ENFORCE_NOT_NULL(cluster, + "something wrong in memory optimization, the " + "variable %s not in the clusters.", + tensor); + + for (auto& candidate : *free_existing_tensors) { + // This is not a temporary tensor. + if (!space_table.count(candidate)) continue; + // Not in the same cluster. + if (!cluster->count(candidate)) continue; + + size_t space = space_table.at(candidate); + size_t space_diff = std::abs(space - space_required); + if (space_diff < best_fit.second) { + best_fit.first = candidate; + best_fit.second = space_diff; + } + } + + if (best_fit.second < std::numeric_limits::max()) { + *tensor2use = best_fit.first; + return true; + } + return false; +} + +// Allocate new tensor instead of reusing the existing one. +void AllocateNewTensor( + const std::string& name, size_t space_required, + const std::unordered_map& tensor_nodes, + std::unordered_set* free_existing_tensors, + space_table_t* space_table, + std::unordered_map* reuse_table) { + // The newly born tensor is free to be used. + free_existing_tensors->insert(name); + // Register the space it has. + PADDLE_ENFORCE(space_table->count(name)); + space_table->at(name) = std::max(space_table->at(name), space_required); + // The allocated new tensor use the memory of itself. + (*reuse_table)[name] = name; +} + +// Free a tensor and make it resuable. +// @tensor: the tensor to free. +// @free_existing_tensors: the free and allocated tensors. +// @reuse_table: a map from a fake tensor to the existing allocated tensor. +void FreeATensor(const std::string& tensor, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table) { + if (tensor == "feed" || tensor == "fetch") return; + // the really allocated tensor. + const auto& free_tensor = reuse_table->at(tensor); + + free_existing_tensors->insert(free_tensor); +} + +// Reuse a free existing tensor. +void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse, + size_t memory_size, + std::unordered_set* free_existing_tensors, + std::unordered_map* reuse_table, + space_table_t* reused_space_table) { + auto it = free_existing_tensors->find(tensor2reuse); + PADDLE_ENFORCE(it != free_existing_tensors->end()); + free_existing_tensors->erase(it); + (*reuse_table)[tensor] = tensor2reuse; + // Update the memory size of a reused tensor, the memory will grow if the + // required memory is larger. + (*reused_space_table)[tensor2reuse] = + std::max(reused_space_table->at(tensor2reuse), memory_size); +} + +// Calculate the memory usage. +void EvaluateMemoryUsage( + const std::unordered_map& reuse_table, + const space_table_t& space_table, + const std::unordered_map& var_batch_ave_size, + size_t* allocated, size_t* saved) { + *allocated = 0; + *saved = 0; + + for (auto elem : reuse_table) { + if (elem.first == elem.second) { + *allocated += space_table.at(elem.first); + VLOG(4) << elem.first << " <-> " << elem.second << " " + << space_table.at(elem.first) << " " + << space_table.at(elem.second); + } else { + *saved += space_table.at(elem.first); + VLOG(4) << "reuse " << elem.first << " -> " << elem.second; + } + } + VLOG(4) << "allocated " << *allocated; + VLOG(4) << "saved " << *saved; +} + +// Return saved ratio. +void MemoryOptimizePass::MakeReusePlan( + const std::vector>& var_clusters, + const std::unordered_map& var_batch_ave_size, + const space_table_t& space_table, + std::unordered_map* reuse_table, int sort_kind, + MemoryAllocation* memory_allocation) const { + // Clear the existing plan. + reuse_table->clear(); + + // The `space_table` stores the real memory size for each tensor. + // The `reused_space_table` stores the maximum memory size required by a + // tensor during the memory reusing, the small tensor might be reused by a + // larger tensor, and the memory size of the small one will grow. + auto reused_space_table = space_table; + + std::unordered_map life_cycles; + std::unordered_map tensor_nodes; + // The allocated tensors whose memory can be reused, they will live across the + // program execution. + std::unordered_set existing_tensors; + // The existing tensor that has been allocated, and is also free to reuse. + std::unordered_set free_existing_tensors; + + CollectLifeCycle(&life_cycles, sort_kind); + + for (int age = 0; age < max_lifecycle_; ++age) { + std::unordered_set born_tensors; + std::unordered_set dead_tensors; + // Gather the dead and born tensors. + for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end(); + elem_it++) { + if (elem_it->second.first == -1) { + continue; + } + const auto& tensor = elem_it->first; + const auto& lifecycle = elem_it->second; + VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->" + << lifecycle.second; + + // Collect newly born tensors. + if (lifecycle.first == age) { + born_tensors.insert(tensor); + } + // Collect dead tensors whose memory can be reused. + else if (lifecycle.second < age) { // NOLINT + dead_tensors.insert(tensor); + // remove to avoid duplicate process. + elem_it->second.first = -1; // avoid duplicate search + } + } + + // Reuse the dead tensors for born tensors + for (const auto& tensor : born_tensors) { + // Skip the feed and fetch tensor for that they share data with others. + std::string tensor2reuse; + if (!space_table.count(tensor)) continue; + size_t space_required = space_table.at(tensor); + if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes, + &free_existing_tensors, reused_space_table, + var_clusters, &tensor2reuse)) { + if (tensor != tensor2reuse) { + VLOG(4) << tensor << " -> " << tensor2reuse; + } + ReuseATensor(tensor, tensor2reuse, space_required, + &free_existing_tensors, reuse_table, &reused_space_table); + } else { + VLOG(4) << "allocate " << tensor; + AllocateNewTensor(tensor, space_required, tensor_nodes, + &free_existing_tensors, &reused_space_table, + reuse_table); + ReuseATensor(tensor, tensor, space_required, &free_existing_tensors, + reuse_table, &reused_space_table); + } + } + + for (const auto& tensor : dead_tensors) { + // free its memory. + FreeATensor(tensor, &free_existing_tensors, reuse_table); + } + } + + EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size, + &(memory_allocation->allocated), + &(memory_allocation->saved)); + memory_allocation->sort_kind = sort_kind; +} + +void BuildVarNodeTable(Graph* graph, + std::unordered_map* var_node_table) { + for (auto* node : graph->Nodes()) { + if (node->IsVar()) { + (*var_node_table)[node->Name()] = node; + } + } +} + +// NOTE The optimized opdesc doesn't match ir::Graph. +void UpdateOpDescsByReuse( + Graph* graph, + const std::unordered_map& reuse_table, + int sort_kind) { + // TODO(Superjomn) change here to be compatible with the runtime order. + for (auto* node : TopologyVarientSort( + *graph, static_cast(sort_kind))) { + if (node->IsOp()) { + // Replace the original inputs/outputs with the reused tensors. + std::unordered_map> in_args, + out_args; + for (auto argument : node->Op()->Inputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + in_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " input " << x << " -> " << name; + } + } + + for (auto argument : node->Op()->Outputs()) { + for (const auto& x : argument.second) { + auto name = x; + if (reuse_table.count(x) && reuse_table.at(x) != x) { + name = reuse_table.at(x); + } + out_args[argument.first].push_back(name); + VLOG(4) << node->Name() << " output " << x << " -> " << name; + } + } + + // Update arguments. + for (auto& arg : in_args) { + node->Op()->SetInput(arg.first, arg.second); + } + for (auto& arg : out_args) { + node->Op()->SetOutput(arg.first, arg.second); + } + node->Op()->Flush(); + } + } +} + +void MemoryOptimizePass::PerformReusePlan( + const std::unordered_map& reuse_table, + int sort_kind, std::unordered_set* vars2remove) const { + std::unordered_map var_node_table; + BuildVarNodeTable(graph_, &var_node_table); + UpdateOpDescsByReuse(graph_, reuse_table, sort_kind); + + for (auto& item : reuse_table) { + if (item.first != item.second) { + vars2remove->insert(item.first); + } + } + VLOG(2) << "to remove vars " << vars2remove->size(); +} + +std::vector split(const std::string& line, char delim) { + std::vector res; + std::string field; + std::stringstream line_stream(line); + while (std::getline(line_stream, field, delim)) { + res.emplace_back(field); + } + return res; +} + +// Deserialize the batch var shapes from the cache file. +std::vector>> DeseralizeBatchVarShapes( + const std::string& path) { + std::ifstream file(path); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path); + std::string line; + std::vector>> batch_shapes; + + while (std::getline(file, line)) { + std::map> batch; + for (const auto& var_info : split(line, ';')) { + auto fields = split(var_info, ':'); + PADDLE_ENFORCE_EQ(fields.size(), 2UL); + auto var_name = fields.front(); + auto shape_str = split(fields[1], ','); + std::vector shape; + for (const auto& v : shape_str) shape.push_back(std::stoi(v)); + batch[var_name] = shape; + } + batch_shapes.push_back(batch); + } + return batch_shapes; +} + +// Calculate the average dim of each tensor from the batch shape cache. +std::unordered_map GetBatchAverageSize( + const std::vector>>& batches) { + std::unordered_map var2size; + // The average size of the batches for each variable. + int num_batch = 0; + for (const auto& batch : batches) { + num_batch++; + for (const auto& item : batch) { + int dim = std::accumulate(item.second.begin(), item.second.end(), 1, + [](int a, int b) { return a * b; }); + var2size[item.first] += dim; + } + } + + for (auto& item : var2size) { + item.second /= num_batch; + } + + return var2size; +} + +// Analysis the batch shapes loading from the cache file. +// By splitting the variables to different clusters by analyzing their batch +// size, we can pre-schedule the changes of difference LoDTensor when different +// length of input sequences is entered. +// This should works fine for the models operating on sentences. +std::vector> AnalysisBatchShapesByBatchSize( + const std::vector>>& batches) { + // collect the batch size of each shape and combine to a stringstream in + // converient to generate a hash. + std::unordered_map var_batchsize_hashes; + for (auto& batch : batches) { + for (auto& ele : batch) { + int batch_size = ele.second.front(); + // TODO(Superjomn) might consume large memory here, use combine hash. + var_batchsize_hashes[ele.first] << batch_size; + } + } + + // Split to sets by batch size sequences. + std::unordered_map> + shape_sets; + for (auto& ele : var_batchsize_hashes) { + auto hash = std::hash()(ele.second.str()); + shape_sets[hash].insert(ele.first); + } + std::vector> res; + for (auto& ele : shape_sets) { + res.emplace_back(std::move(ele.second)); + } + + VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters"; + return res; +} + +// Analysis the batch shapes loading from the cache file, and split them to +// different clusters by their size. +// This should works fine for the overall models. +std::vector> AnalysisBatchShapesBySimilarSize( + const space_table_t& space_table, + const std::vector>>& batches, + int interval = 200000) { + PADDLE_ENFORCE_GT(interval, 0); + // cluster to different clusters. + size_t max_size = 0; + for (auto& item : space_table) { + max_size = std::max(item.second, max_size); + } + VLOG(4) << "tensor max size " << max_size; + + std::vector> res; + + // cluster by intervals. + for (size_t interval_size = 0; interval_size <= max_size; + interval_size += interval) { + std::unordered_set cluster; + for (auto& item : space_table) { + if (interval_size <= item.second && + interval_size + interval > item.second) { + cluster.insert(item.first); + } + } + if (!cluster.empty()) { + res.push_back(cluster); + } + } + + VLOG(3) << "Cluster by interval and get " << res.size() << " cluster"; + return res; +} + +std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } + +void MemoryOptimizePass::RunImpl(Argument* argument) { + // When force update, should not optimize memory. + if (!argument->enable_memory_optim() || argument->memory_optim_force_update()) + return; + graph_ = argument->main_graph_ptr(); + + auto path = GetMemoryCachePath( + argument->model_dir_valid() ? argument->model_dir() : "", + argument->model_program_path_valid() ? argument->model_program_path() + : ""); + VLOG(3) << "Load memory cache from " << path; + if (inference::IsFileExists(path)) { + VLOG(4) << "Performing memory optimize"; + auto batches = DeseralizeBatchVarShapes(path); + auto var_batch_ave_size = GetBatchAverageSize(batches); + + std::unordered_map tensor_nodes; + space_table_t space_table; + CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table); + + std::unordered_map reuse_table; + double max_saving_ratio = 0.; + + std::vector> strategies; + + for (int sort_kind = 0; sort_kind < 2; sort_kind++) { + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_batch_size = + AnalysisBatchShapesByBatchSize(batches); + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, 1024); // interval 1kb + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, 1024 * 1024); // interval 1MB + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + + strategies.emplace_back([&, sort_kind] { + auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize( + space_table, batches, + std::numeric_limits::max()); // no intervals + MemoryAllocation allocation; + MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size, + space_table, &reuse_table, sort_kind, &allocation); + return allocation; + }); + } + + std::function* best_strategy{nullptr}; + + // Try all strategies to get the best result. + for (auto& strategy : strategies) { + auto allocation = strategy(); + string::PrettyLogDetail("--- get strategy saving %f memory for workspace", + allocation.GetSavingRatio()); + if (allocation.GetSavingRatio() > max_saving_ratio) { + max_saving_ratio = allocation.GetSavingRatio(); + best_strategy = &strategy; + } + } + if (!best_strategy) { + LOG(ERROR) + << "This model makes poor memory optimize, skip memory optimize"; + return; + } + auto memory_allocation = (*best_strategy)(); + + string::PrettyLogH2( + "--- Saved %.2f%s memory for workspace(temporary variables)", + memory_allocation.GetSavingRatio() * 100, "%"); + string::PrettyLogDetail("--- Allocated %d MB", + memory_allocation.allocated / 1024. / 1024.); + string::PrettyLogDetail("--- Saved %d MB", + memory_allocation.saved / 1024. / 1024.); + argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove, + new std::unordered_set); + auto& vars2remove = + argument->main_graph().Get>( + framework::ir::kGraphToProgramVarsToRemove); + + PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove); + argument->SetMemoryOptimSortKind(memory_allocation.sort_kind); + } +} + +float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const { + return (saved / 1024.) / (allocated / 1024. + saved / 1024.); +} +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fa1ad9c8c6aeff60ec4468f41140c57be790af7f --- /dev/null +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.h @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +/* + * Memory optimization pass for inference with pre-analysis of memory usage + * without GC. + * Different from training, the inference memory reuse strategies doesn't + * include GC for that overhead is too much when batch size equals one. + * + * The inference memory reuse tries to pre-determine the tensor reusing strategy + * without runtime overhead. + * + * To improve the strategy's performance, a warm-up running is introduced: + * - Before officially deploy the inference program, one should warm it up and + * generate some runtime cache, + * - Run the inference program with several batches of data, it will persist + * some runtime information about memory of tensors to disk, we call the + * information the memory reusing cache, + * - With the memory reusing cache, user can deploy the inference to a + * service, before running the model, the inference program will load the + * memory cache, analysis it and generate the best memory reusing strategy, + * and adjust the execution of the network. + * + * With the warm-up and memory reusing cache design, the memory reusing + * algorithm can analysis the real memory consume of the tensors, even with the + * flexible LoDTensor and special shape changing operators such as + * sequence-pooling. + */ +class MemoryOptimizePass : public AnalysisPass { + public: + using space_table_t = std::unordered_map; + using lifecycle_t = std::pair; + + struct MemoryAllocation { + size_t allocated; // allocated memory in byte. + size_t saved; // saved memory in byte. + int sort_kind; // the kind of the corresponding sorting algorithm. + + // Get the memory saving ratio of temporary variables. + float GetSavingRatio() const; + }; + + virtual ~MemoryOptimizePass() = default; + + protected: + void RunImpl(Argument *argument) override; + + private: + void CollectLifeCycle( + std::unordered_map *lifecycles, + int sort_kind) const; + + void CollectVarMemorySize( + const std::unordered_map &batch_var_ave_dim, + std::unordered_map *tensor_nodes, + space_table_t *space_table) const; + + // Returns percentage of saved memory. + void MakeReusePlan( + const std::vector> &var_clusters, + const std::unordered_map &var_batch_ave_size, + const space_table_t &space_table, + std::unordered_map *reuse_table, int sort_kind, + MemoryAllocation *memory_allocation) const; + + void PerformReusePlan( + const std::unordered_map &reuse_table, + int sort_kind, std::unordered_set *vars2remove) const; + + public: + std::string repr() const override; + + private: + mutable framework::ir::Graph *graph_{nullptr}; + mutable int max_lifecycle_{-1}; +}; + +static std::string GetMemoryCachePath(const std::string &model_path, + const std::string &prog_path) { + auto path = model_path.empty() ? prog_path : model_path; + return path + ".memory_cache"; +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 9245e32cee28473c21e2acbc1c64165d8b475d3b..161b127d6d5ceb3e8b9c1cf98c69eb0387bfb905 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -13,24 +13,31 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/passes.h" -#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" +#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" namespace paddle { namespace inference { namespace analysis { + PassRegistry::PassRegistry() { + // Register manually to avoid the trivial `USE_OP` like macro for easier use + // and link. passes_.emplace("ir_analysis_pass", std::unique_ptr(new IrAnalysisPass)); passes_.emplace("ir_graph_build_pass", std::unique_ptr(new IrGraphBuildPass)); - passes_.emplace("ir_analysis_compose_pass", - std::unique_ptr(new IrAnalysisComposePass)); + passes_.emplace("memory_optimize_pass", + std::unique_ptr(new MemoryOptimizePass)); passes_.emplace( "ir_params_sync_among_devices_pass", std::unique_ptr(new IrParamsSyncAmongDevicesPass)); + passes_.emplace( + "ir_graph_to_program_pass", + std::unique_ptr(new IrGraphToProgramPass)); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 8b3838f69a89498648c1cf5cb9573d1f68034fe2..ad0af4005ad154d2f5c67d00dec9d7ec397eb662 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -18,8 +18,10 @@ if(APPLE) endif(APPLE) -set(inference_deps paddle_inference_api paddle_fluid_api analysis pass - ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB}) +set(inference_deps ${analysis_deps} + paddle_inference_api paddle_fluid_api + analysis pass naive_executor + ${GLOB_PASS_LIB}) if(WITH_GPU AND TENSORRT_FOUND) set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter) @@ -29,7 +31,8 @@ add_subdirectory(details) cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) -cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager) +cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor + reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps}) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config analysis_config paddle_pass_builder zero_copy_tensor @@ -44,7 +47,7 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() -cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 965bbd0fd26ce39f72b622bce0ecb7b3bbdf4f2f..f9da3004ed8306ef08144d096afa4f86133e492d 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const { contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { model_dir_ = model_dir; + + Update(); } contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, const std::string ¶ms_file) { prog_file_ = prog_file; params_file_ = params_file; + + Update(); } void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, const std::string ¶ms_file_path) { prog_file_ = prog_file_path; params_file_ = params_file_path; + + Update(); } void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id) { @@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, memory_pool_init_size_mb_ = memory_pool_init_size_mb; device_id_ = device_id; #else - LOG(ERROR) << "Please compile with gpu to EnableGpu"; + LOG(ERROR) << "Please compile with gpu to EnableGpu()"; use_gpu_ = false; #endif + + Update(); +} +void contrib::AnalysisConfig::DisableGpu() { + use_gpu_ = false; + + Update(); } -void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { #define CP_MEMBER(member__) member__ = other.member__; @@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { CP_MEMBER(use_gpu_); CP_MEMBER(device_id_); CP_MEMBER(memory_pool_init_size_mb_); + + CP_MEMBER(enable_memory_optim_); + CP_MEMBER(memory_optim_force_update_); // TensorRT releated. CP_MEMBER(use_tensorrt_); CP_MEMBER(tensorrt_workspace_size_); @@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { } #undef CP_MEMBER + + Update(); } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() { LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; use_mkldnn_ = false; #endif + + Update(); } void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, int max_batch_size, int min_subgraph_size) { +#ifdef PADDLE_WITH_CUDA + if (!use_gpu()) { + LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; + return; + } + use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; + Update(); +#else + LOG(ERROR) + << "To use TensorRT engine, please compile inference lib with GPU first."; +#endif } +// TODO(Superjomn) refactor this, buggy. void contrib::AnalysisConfig::Update() { auto info = SerializeInfoCache(); if (info == serialized_info_cache_) return; - if (use_gpu_) { - pass_builder_.reset(new GpuPassStrategy); + // Transfer pass_builder and copy the existing compatible passes. + if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) { + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy); + + if (use_tensorrt_) { + // Append after the Affine_channel_conv_fuse pass. + pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); + } + } else { + pass_builder_.reset(new CpuPassStrategy); + } + } else { - pass_builder_.reset(new CpuPassStrategy); + if (use_gpu()) { + pass_builder_.reset(new GpuPassStrategy( + *static_cast(pass_builder_.get()))); + + } else { + pass_builder_.reset(new CpuPassStrategy( + *static_cast(pass_builder_.get()))); + } } if (use_tensorrt_) { - if (!use_gpu_) { - LOG(ERROR) - << "TensorRT engine is not available when EnableGpu() not actived."; - } else { + const auto &passes = pass_builder_->AllPasses(); + if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") == + std::end(passes)) { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } @@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() { #endif } + if (enable_memory_optim_) { + pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + } + if (ir_debug_) { pass_builder()->TurnOnDebug(); } @@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() { std::string contrib::AnalysisConfig::SerializeInfoCache() { std::stringstream ss; + ss << model_dir_; + ss << prog_file_; + ss << params_file_; + ss << use_gpu_; + ss << device_id_; ss << memory_pool_init_size_mb_; ss << use_tensorrt_; ss << tensorrt_workspace_size_; ss << tensorrt_max_batchsize_; + ss << tensorrt_min_subgraph_size_; + + ss << enable_memory_optim_; + ss << memory_optim_force_update_; ss << use_mkldnn_; + for (auto &item : mkldnn_enabled_op_types_) ss << item; + ss << ";"; + + ss << model_from_memory_; + ss << enable_ir_optim_; ss << use_feed_fetch_ops_; ss << ir_debug_; + ss << specify_input_name_; + ss << cpu_math_library_num_threads_; + return ss.str(); } void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( int cpu_math_library_num_threads) { cpu_math_library_num_threads_ = cpu_math_library_num_threads; + + Update(); } float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { @@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { #endif } +void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) { + enable_memory_optim_ = true; + memory_optim_force_update_ = force_update_cache; + + Update(); +} + +bool contrib::AnalysisConfig::enable_memory_optim() const { + return enable_memory_optim_; +} + void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, @@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; + + Update(); } } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3917b9b65b5905be38ba8a236aa158f42586c825..2b0cad5faa0e31cb7546d405e05e36754915f653 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -24,18 +24,21 @@ #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" -#if PADDLE_WITH_TENSORRT -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -#endif #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" +#if PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#endif + DECLARE_bool(profile); namespace paddle { @@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector &inputs, LOG(ERROR) << "fail to get fetches"; return false; } + + // Collect variable shapes for memory optimization. + if (need_collect_var_shapes_for_memory_optim()) { + CollectVarShapes(); + } + VLOG(3) << "predict cost: " << timer.toc() << "ms"; // All the containers in the scope will be hold in inference, but the @@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetGPUDeviceId(config_.gpu_device_id()); + argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); + argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir().empty()) { @@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { + LOG(INFO) << "TensorRT subgraph engine is enabled"; argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); @@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } if (config_.use_mkldnn_) { + LOG(INFO) << "MKLDNN is enabled"; argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.ir_optim()) passes.clear(); + if (!config_.ir_optim()) { + passes.clear(); + LOG(INFO) << "ir_optim is turned off, no IR pass will be executed"; + } argument_.SetIrAnalysisPasses(passes); + argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() { if (sub_scope_) { scope_->DeleteScope(sub_scope_); } + + // TODO(Superjomn) deduce the directory path. + std::string out_path = inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()); + if (need_collect_var_shapes_for_memory_optim()) { + SerializeBatchVarShapes(out_path); + } } std::unique_ptr AnalysisPredictor::Clone() { @@ -567,6 +591,66 @@ std::unique_ptr AnalysisPredictor::Clone() { return std::unique_ptr(x); } +void AnalysisPredictor::CollectVarShapes() { + VLOG(4) << "Collecting var shapes"; + if (batch_var_shapes_.size() >= max_shape_collect_count_) return; + std::map> var_shapes; + for (auto var_name : inference_program_->Block(0).LocalVarNames()) { + auto *var = sub_scope_->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL(var); + if (var->Type() == framework::VarTypeTrait::kId || + var->Type() == framework::VarTypeTrait::kId) { + auto &tensor = var->Get(); + auto shape = framework::vectorize(tensor.dims()); + var_shapes[var_name].assign(shape.begin(), shape.end()); + } + } + batch_var_shapes_.push_back(var_shapes); + LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size() + << " batch of var shapes for analysis"; +} + +void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) { + LOG(INFO) << "serialize batch var shapes to " << path; + std::ofstream file(path); + if (!file.is_open()) { + LOG(ERROR) << "failed to serialize the var shapes to " << path; + return; + } + + // The sirialized data format: + // :dim0,dim1,dim2,; + for (auto &batch : batch_var_shapes_) { + for (auto &ele : batch) { + file << ele.first << ":"; + for (size_t i = 0; i < ele.second.size() - 1; i++) { + file << ele.second[i] << ","; + } + file << ele.second.back() << ";"; + } + file << "\n"; + } +} + +bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { + if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_; + bool need = false; + // check if the cache exists + if (!config_.enable_memory_optim()) { + need = false; + } else if (config_.enable_memory_optim() && + !inference::IsFileExists(inference::analysis::GetMemoryCachePath( + config_.model_dir(), config_.prog_file()))) { + need = true; + } else if (config_.enable_memory_optim() && + config_.memory_optim_force_update_) { + need = true; + } + + need_collect_var_shapes_ = need ? 1 : 0; + return need; +} + template <> std::unique_ptr CreatePaddlePredictor( const contrib::AnalysisConfig &config) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 6ca4b5e9bed7505fca3b833dfbb7026ff550d258..e25b5a7047b849eace7e89b3181766dc940f83c1 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor { void SetMkldnnThreadID(int tid); protected: + // For memory optimization. + bool need_collect_var_shapes_for_memory_optim(); + void CollectVarShapes(); + void SerializeBatchVarShapes(const std::string &path); + bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); bool CreateExecutor(); @@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor { // A mutex help to make Clone thread safe. std::mutex clone_mutex_; + // For memory optimization. + const size_t max_shape_collect_count_{1000}; + int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true. + std::vector>> batch_var_shapes_; + private: // Some status here that help to determine the status inside the predictor. bool status_program_optimized_{false}; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 3df26cde3d5defac97074c9bc4086e81f9ec0c93..4688e93d7102109d2c7ece9ba37bc8f2d311dcf1 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -16,8 +16,10 @@ #include #include #include // NOLINT +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" DEFINE_string(dirname, "", "dirname to tests."); @@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) { } } +TEST(AnalysisPredictor, memory_optim) { + AnalysisConfig config(FLAGS_dirname); + config.DisableGpu(); + config.EnableMemoryOptim(true); + config.pass_builder()->TurnOnDebug(); + + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); + + // 2. Dummy Input Data + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data.Reset(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + std::vector inputs(4, tensor); + std::vector output, output1; + + { + // The first predictor help to cache the memory optimize strategy. + auto predictor = CreatePaddlePredictor(config); + + // Run several times to check the parameters are not reused by mistake. + for (int i = 0; i < 5; i++) { + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + } + + { + output.clear(); + // The second predictor to perform memory optimization. + config.EnableMemoryOptim(false); + auto predictor = CreatePaddlePredictor(config); + + // Run with memory optimization + ASSERT_TRUE(predictor->Run(inputs, &output)); + } + + // Run native + ASSERT_TRUE(native_predictor->Run(inputs, &output1)); + + LOG(INFO) << "the output " << inference::DescribeTensor(output.front()); + LOG(INFO) << "the native output " + << inference::DescribeTensor(output1.front()); + + inference::CompareResult(output, output1); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 9811fe2cd0621708bf896b10a036049dfc4d3898..963986f245cdafa737d76953f0e5323e4f74e669 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -1,3 +1,4 @@ +#!/bin/bash set -x PADDLE_ROOT=$1 TURN_ON_MKL=$2 # use MKL or Openblas diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index cdd01cb9f06424b39d17e192f9a924451ad1daaf..b92781e4f2c612cbb39fcaa7c80b6051a67215fd 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,7 +15,10 @@ #pragma once #include - +#include +#if !defined(_WIN32) +#include +#endif #include #include // NOLINT #include @@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { return true; } -static std::string DescribeTensor(const PaddleTensor &tensor) { +static std::string DescribeTensor(const PaddleTensor &tensor, + int max_num_of_data = 15) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; os << " - type: "; @@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, } } +static bool IsFileExists(const std::string &path) { + std::ifstream file(path); + bool exists = file.is_open(); + file.close(); + return exists; +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index ae6ac69854d91d44567ccd985791de5fd2b16f26..1cee8904500636d7b49e6b4e54595dbce6a79954 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -192,6 +192,13 @@ struct AnalysisConfig { */ bool model_from_memory() const { return model_from_memory_; } + /** Turn on memory optimize + * NOTE still in development, will release latter. + */ + void EnableMemoryOptim(bool force_update_cache = false); + /** Tell whether the memory optimization is activated. */ + bool enable_memory_optim() const; + friend class ::paddle::AnalysisPredictor; /** NOTE just for developer, not an official API, easily to be broken. @@ -232,6 +239,10 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; + // memory reuse related. + bool enable_memory_optim_{false}; + bool memory_optim_force_update_{false}; + bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index bc3ce72f0832c4bf029f86e023bd9ff11f6578bd..039389a4cf99da6c2576c148d8c294e5d79aa7a8 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/api/paddle_pass_builder.h" + #include namespace paddle { @@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() { LOG(ERROR) << "GPU not support MKLDNN yet"; } +void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { + analysis_passes_.push_back(pass); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index efe1ba106a2fcbac66a773e56b98d1a6452f4013..d3a60d209922ebe8d31723ca25c71a952ea08bd6 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -45,6 +45,9 @@ class PaddlePassBuilder { /** Delete all the passes that has type `pass_type`. */ void DeletePass(const std::string &pass_type); + /** Append an analysis pass. */ + void AppendAnalysisPass(const std::string &pass); + /** Visualize the computation graph after each pass by generating a DOT * language file, one can draw them with the Graphviz toolkit. */ @@ -54,8 +57,18 @@ class PaddlePassBuilder { std::string DebugString(); const std::vector &AllPasses() const { return passes_; } + std::vector AnalysisPasses() const { + auto passes = analysis_passes_; + // To make sure the ir_graph_to_program should be the last pass so any + // modication of IR will persist to the program. + passes.push_back("ir_graph_to_program_pass"); + return passes; + } protected: + std::vector analysis_passes_{ + {"ir_graph_build_pass", "ir_analysis_pass", + "ir_params_sync_among_devices_pass"}}; std::vector passes_; }; @@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder { /** The MKLDNN control exists in both CPU and GPU mode, because there can be * still some CPU kernels running in CPU mode. */ - virtual void EnableMKLDNN() = 0; + virtual void EnableMKLDNN() {} bool use_gpu() const { return use_gpu_; } @@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder { protected: bool use_gpu_{false}; + bool use_mkldnn_{false}; }; /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. @@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy { use_gpu_ = false; } + explicit CpuPassStrategy(const CpuPassStrategy &other) + : PassStrategy(other.AllPasses()) {} + virtual ~CpuPassStrategy() = default; void EnableMKLDNN() override { // TODO(Superjomn) Consider the way to mix CPU with GPU. #ifdef PADDLE_WITH_MKLDNN - passes_.insert(passes_.begin(), "mkldnn_placement_pass"); - - for (auto &pass : - std::vector({"depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv3d_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // - "conv_elementwise_add_mkldnn_fuse_pass"})) { - passes_.push_back(pass); + if (!use_mkldnn_) { + passes_.insert(passes_.begin(), "mkldnn_placement_pass"); + + for (auto &pass : std::vector( + {"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // + "conv_elementwise_add_mkldnn_fuse_pass"})) { + passes_.push_back(pass); + } } + use_mkldnn_ = true; +#else + use_mkldnn_ = false; #endif } - - CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {} }; /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. @@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy { use_gpu_ = true; } - GpuPassStrategy(const GpuPassStrategy &other) + explicit GpuPassStrategy(const GpuPassStrategy &other) : PassStrategy(other.AllPasses()) { use_gpu_ = true; } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index adbf98e9e8a535938157ae5c8214d8bfffbc3314..423c39813f05af0d6aaade184914e6777c9b8a83 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -19,7 +19,7 @@ endfunction() function(inference_analysis_api_test target install_dir filename) inference_analysis_test(${target} SRCS ${filename} - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) endfunction() @@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index fc87e0a8d1765f5a5c5bcfd21764b65165f35dc3..4ec9404ab42bcd9cc0608f033cb2777106a29583 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -126,6 +126,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, std::string turn_mask_pre = "turn_mask_"; auto one_batch = data->NextBatch(); + PADDLE_ENFORCE(!one_batch.response.empty()); int size = one_batch.response[0].size(); CHECK_EQ(size, kMaxTurnLen); // turn tensor assignment @@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) { std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); @@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) { reinterpret_cast(&cfg), input_slots_all); } +// Compare result of NativeConfig and AnalysisConfig with memory optimization. +TEST(Analyzer_dam, compare_with_memory_optim) { + // The small dam will core in CI, but works in local. + if (FLAGS_max_turn_num == 9) { + contrib::AnalysisConfig cfg, cfg1; + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + // Run the first time to force to update memory cache + SetConfig(&cfg); + cfg.EnableMemoryOptim(true); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), + input_slots_all); + + // Run second time to use the memory cache and perform memory optimization. + SetConfig(&cfg1); + cfg1.EnableMemoryOptim(); + + CompareNativeAndAnalysis( + reinterpret_cast(&cfg1), + input_slots_all); + } +} + TEST(Analyzer_dam, compare) { compare(); } + #ifdef PADDLE_WITH_MKLDNN TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } #endif diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 7b448a3200351f902df277f7a653cf7114becba0..2db297e2005c6b657259187d6b6b76657d9e4388 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -69,6 +69,7 @@ void SetInput(std::vector> *inputs) { TEST(Analyzer_Text_Classification, profile) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) { TEST(Analyzer_Text_Classification, compare) { AnalysisConfig cfg; SetConfig(&cfg); + cfg.EnableMemoryOptim(); std::vector> input_slots_all; SetInput(&input_slots_all); diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index 5a77b53a8513cdbef5620d36ba5e0722ae993916..f3e75ffbb5962885bd926af50b764bec561cc454 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include #include "paddle/fluid/inference/tests/api/tester_helper.h" @@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) { FLAGS_infer_model + "/__params__"); cfg->DisableGpu(); cfg->SwitchIrDebug(); - cfg->SwitchSpecifyInputNames(); + cfg->SwitchSpecifyInputNames(false); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } @@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) { if (use_mkldnn) { cfg.EnableMKLDNN(); } + // cfg.pass_builder()->TurnOnDebug(); std::vector outputs; std::vector> input_slots_all; @@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) { size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); CHECK_EQ(numel, refer.data.size()); for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); + EXPECT_NEAR(static_cast(output.data.data())[i], refer.data[i], + 1e-5); } } } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index ac964dc0c863daf0c0917c638e80745a3f672e41..d2ca1d0b0098fd377725cc0fdf002d8d2e3539ec 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -15,6 +15,7 @@ #pragma once #include + #include #include #include // NOLINT @@ -28,9 +29,8 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/analysis_predictor.h" -#include "paddle/fluid/inference/api/paddle_inference_pass.h" - #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/utils/benchmark.h" @@ -91,7 +91,7 @@ void CompareResult(const std::vector &outputs, float *pdata = static_cast(out.data.data()); float *pdata_ref = static_cast(ref_out.data.data()); for (size_t j = 0; j < size; ++j) { - EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy); + CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy); } break; } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9725c1903293b2ce4f6c4baeb1e6d63af28c1c3b..5aca807ee3aee1bb323abe6d5c3700dfc08e30b4 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) { } } +TEST(TensorRT_mobilenet, profile) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + profile(model_dir, true, false); +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/utils/benchmark.h b/paddle/fluid/inference/utils/benchmark.h index 76a3dd2c2992ebdf2528c539b3d161f558b34a08..a1304cf4e7778f74e15e4fe5e2f405fd3c185eb4 100644 --- a/paddle/fluid/inference/utils/benchmark.h +++ b/paddle/fluid/inference/utils/benchmark.h @@ -11,8 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#pragma once +#pragma once #include #include #include diff --git a/paddle/fluid/inference/utils/benchmark_tester.cc b/paddle/fluid/inference/utils/benchmark_tester.cc index eb255474082b27180a8b3176b5f880c0d38f6c3b..80763160df3adfd8c34e66bc7a5370808b349e76 100644 --- a/paddle/fluid/inference/utils/benchmark_tester.cc +++ b/paddle/fluid/inference/utils/benchmark_tester.cc @@ -16,7 +16,7 @@ #include #include -using namespace paddle::inference; +using namespace paddle::inference; // NOLINT TEST(Benchmark, basic) { Benchmark benchmark; benchmark.SetName("key0"); @@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) { benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); benchmark.PersistToFile("1.log"); -} \ No newline at end of file +} diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 86b3114cb3c452cd5942cc86dcf0f5e768f216a4..0dfed7f5cc1e929c1fb566df1a7dfb4b2450323b 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase { << out_name; auto &feed_list = feed_var->Get(); + PADDLE_ENFORCE_LT(static_cast(col), feed_list.size()); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); diff --git a/paddle/fluid/string/pretty_log.h b/paddle/fluid/string/pretty_log.h index 10c9eb80d0a7e07d5974ca10d740e71e7717b5c5..da4c1f326fbc2703e639279d79acb52dc748266a 100644 --- a/paddle/fluid/string/pretty_log.h +++ b/paddle/fluid/string/pretty_log.h @@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt, std::cerr << style << Sprintf(fmt, args...) << reset(); } +template +static void PrettyLogInfo(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::info(), fmt, args...); +} +template +static void PrettyLogDetail(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::detail(), fmt, args...); +} +template +static void PrettyLogH1(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::H1(), fmt, args...); +} +template +static void PrettyLogH2(const char *fmt, const Args &... args) { + PrettyLogEndl(Style::H2(), fmt, args...); +} + } // namespace string } // namespace paddle