Unverified commit 885c4e57, authored by Yan Chunwei, committed by GitHub

fea/infer memory optim2 (#14953)

Parent 6597ccb0
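For orientation, the end-user entry point added by this change is contrib::AnalysisConfig::EnableMemoryOptim(). Below is a hedged usage sketch, modeled on the new analysis_predictor_tester.cc test; the model directory and input values are placeholders and are not part of this commit.

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

int main() {
  paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // placeholder path
  config.DisableGpu();
  // New in this commit: a warm-up predictor records tensor shapes and, on
  // destruction, serializes them to "<model>.memory_cache" (see
  // GetMemoryCachePath further down); later predictors load that cache so the
  // memory_optimize_pass can plan tensor reuse before execution.
  config.EnableMemoryOptim();

  auto predictor =
      paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);

  int64_t data[4] = {1, 2, 3, 4};
  paddle::PaddleTensor tensor;
  tensor.shape = {4, 1};
  tensor.data.Reset(data, sizeof(data));
  tensor.dtype = paddle::PaddleDType::INT64;

  std::vector<paddle::PaddleTensor> inputs(4, tensor), outputs;
  predictor->Run(inputs, &outputs);
  return 0;
}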
...@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
...@@ -18,8 +18,10 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_traits.h"

DEFINE_string(print_sub_graph_dir, "",
              "FLAGS_print_sub_graph_dir is used "
...@@ -41,7 +43,7 @@ void SortHelper(
    }
  }

  VLOG(5) << "topology sort insert: " << node->Name() << " "
          << reinterpret_cast<void *>(node) << " input " << node->inputs.size();
  ret->push_back(node);
}
...@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
  return ret;
}

// Build operator inlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph) {
  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;

  for (auto &n : graph.Nodes()) {
    if (!n->IsOp()) continue;
    if (adj_list.find(n) == adj_list.end()) {
      adj_list[n] = std::unordered_set<ir::Node *>();
    }
...@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
}
// Build operator outlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
for (auto &n : graph.Nodes()) {
if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
for (auto &var : n->outputs) {
for (auto &adj_n : var->outputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
}
}
}
return adj_list;
}
std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
auto edge_table = BuildOperationOutAdjList(graph);
std::stack<Node *> stack;
for (auto &ele : edge_table) {
if (ele.first->inputs.empty()) {
// find the input ops (those without input vars)
stack.push(ele.first);
} else {
// find the ops with only persistable vars as inputs.
bool all_persistable = true;
for (auto *input : ele.first->inputs) {
if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
all_persistable = false;
}
}
if (all_persistable) {
stack.push(ele.first);
}
}
}
std::vector<Node *> res;
// start from the feed op and DFS
std::unordered_set<Node *> unique_set;
while (!stack.empty()) {
// will start from the last feed by default.
auto cur = stack.top();
stack.pop();
unique_set.insert(cur);
res.push_back(cur);
for (auto *op : edge_table[cur]) {
if (!unique_set.count(op)) {
stack.push(op);
}
}
}
return res;
}
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
std::vector<ir::Node *> nodes;
std::unordered_map<Node *, int> in_degree;
auto set_out_ops_ready = [&](Node *var) {
for (auto *op : var->outputs) {
--in_degree[op];
}
};
// build in_degree
for (auto *node : graph.Nodes()) {
if (node->IsOp()) {
in_degree[node] += node->inputs.size();
} else if (node->IsVar() && node->inputs.empty()) {
// put all the inputs of the whole graph ready.
set_out_ops_ready(node);
}
}
std::deque<Node *> op_queue;
// first visit
for (auto &node : OpDFSSort(graph)) {
if (node->IsOp()) {
op_queue.push_back(node);
}
}
// traverse the graph
int num_ops = op_queue.size();
while (num_ops) {
for (auto it = op_queue.begin(); it != op_queue.end(); it++) {
auto *&cur_op = *it;
if (!cur_op || in_degree[cur_op] > 0) continue;
// visit this node
// put all the output var of this op valid.
for (auto *out_var : cur_op->outputs) {
if (!out_var) continue;
set_out_ops_ready(out_var);
}
VLOG(8) << "visit " << cur_op->Name();
nodes.push_back(cur_op);
cur_op = nullptr;
num_ops--;
}
}
return nodes;
}
size_t GraphNum(const Graph &graph) {
  std::unordered_set<ir::Node *> nodes(graph.Nodes());
  std::unordered_set<ir::Node *> visited_nodes;
...@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
  return graph_count;
}
void CleanIndividualNodes(Graph *graph) {
std::unordered_set<Node *> nodes2rm;
for (auto *node : graph->Nodes()) {
if (node->inputs.empty() && node->outputs.empty()) {
nodes2rm.insert(node);
}
}
for (auto *node : nodes2rm) {
graph->RemoveNode(node);
}
}
std::vector<Node *> TopologyVarientSort(const Graph &graph,
SortKind sort_kind) {
switch (sort_kind) {
case SortKind::TS:
return framework::ir::TopologySortOperations(graph);
default:
return framework::ir::TopologyDfsSortOperations(graph);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
...@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
// `graph` cannot contain circle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
// Topological sort, but try to DFS.
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
// Different kinds to sort the operators in a graph to a sequence.
enum class SortKind {
// Topological Search
TS = 0,
// Topological and Depth First Search
TDFS
};
// Several kinds of topological sort.
std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
// Clean the nodes that don't connect to others.
void CleanIndividualNodes(Graph *graph);
// Build an adjacency list of operations for the `graph`.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
    const Graph &graph);
......
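To make the new sort entry points declared above concrete, here is a small hedged sketch of a caller choosing between the two flavors; it assumes an already-built framework::ir::Graph and is not part of this commit.

#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"

// Returns the operator nodes of `graph` in either plain topological order or
// the DFS-flavored order that the inference memory optimization relies on.
std::vector<paddle::framework::ir::Node *> SortedOps(
    const paddle::framework::ir::Graph &graph, bool prefer_dfs) {
  using paddle::framework::ir::SortKind;
  return paddle::framework::ir::TopologyVarientSort(
      graph, prefer_dfs ? SortKind::TDFS : SortKind::TS);
}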
...@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
...@@ -29,6 +28,14 @@ namespace ir {
std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
    std::unique_ptr<Graph> graph) const {
// Remove the unneeded variables after memory optimization.
std::unordered_set<std::string> vars2remove;
if (graph->Has(kGraphToProgramVarsToRemove)) {
vars2remove = graph->Get<std::unordered_set<std::string>>(
kGraphToProgramVarsToRemove);
VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
}
  ProgramDesc& program = Get<ProgramDesc>("program");

  std::unique_ptr<proto::ProgramDesc> program_pb(
...@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
  std::unordered_set<std::string> visited_vars;
  for (ir::Node* n : graph->Nodes()) {
    if (n->IsVar()) {
      if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
          !vars2remove.count(n->Var()->Name())) {
        visited_vars.insert(n->Var()->Name());
        block->add_vars()->MergeFrom(*n->Var()->Proto());
      }
    }
  }
  block->clear_ops();

  std::vector<ir::Node*> nodes;
  if (Has(kGraphToProgramSortKind)) {
    // Inference Memory Optimize relies on this branch.
    int sort_kind = Get<int>(kGraphToProgramSortKind);
    nodes = TopologyVarientSort(
        *graph, static_cast<framework::ir::SortKind>(sort_kind));
  } else {
    nodes = TopologySortOperations(*graph);
  }

  for (ir::Node* n : nodes) {
    if (!n->Op()) continue;
    block->add_ops()->MergeFrom(*n->Op()->Proto());
  }

  program.CopyFrom(*program_pb);
  return graph;
}

} // namespace ir
} // namespace framework
} // namespace paddle
......
...@@ -20,6 +20,10 @@ namespace paddle {
namespace framework {
namespace ir {

const char kGraphToProgramVarsToRemove[] =
    "__graph_to_program_vars_to_remove__";
const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";

class GraphToProgramPass : public Pass {
 protected:
  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
......
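For illustration, a hedged sketch of how an upstream pass could use these two attribute keys to influence graph_to_program_pass; the variable name "fc_0.tmp_1" is hypothetical and this helper is not part of the commit.

#include <string>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"

void MarkGraphForProgramConversion(paddle::framework::ir::Graph *graph,
                                   paddle::framework::ir::Pass *to_program_pass) {
  using namespace paddle::framework::ir;
  // Ask for the DFS-flavored operator order when the program is rebuilt.
  to_program_pass->Set(kGraphToProgramSortKind,
                       new int(static_cast<int>(SortKind::TDFS)));
  // Variables listed here are skipped when vars are copied into the program.
  graph->Set(kGraphToProgramVarsToRemove,
             new std::unordered_set<std::string>({"fc_0.tmp_1"}));
}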
...@@ -64,7 +64,7 @@ class Node {
  std::string Name() const { return name_; }

  VarDesc* Var() const {
    PADDLE_ENFORCE(IsVar());
    return var_desc_.get();
  }
......
...@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
      "running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
  for (auto &op : ops_) {
    VLOG(4) << std::this_thread::get_id() << " run "
            << op->DebugStringEx(scope_) << " on scope " << scope_;
    op->SetIsCalledByExecutor(false);
    op->Run(*scope_, place_);
  }
...@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
    anc = anc->parent();
  }

  int num_vars = 0;
  for (auto &var : global_block.AllVars()) {
    if (var->Name() == framework::kEmptyVarName) {
      continue;
    }
    num_vars++;

    if (persistable == var->Persistable()) {
      if (persistable) {
...@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
      }
    }
  }
  VLOG(4) << "naive executor create " << num_vars << " vars";
}

void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
......
...@@ -18,6 +18,7 @@ cc_library(analysis SRCS
    analyzer.cc
    analysis_pass
    DEPS ${analysis_deps} analysis_helper
    ${INFER_IR_PASSES}
    )

cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
...@@ -15,8 +15,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace inference {
...@@ -24,13 +24,16 @@ namespace analysis {
Analyzer::Analyzer() {}

void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }

void Analyzer::RunAnalysis(Argument *argument) {
  PADDLE_ENFORCE(argument->analysis_passes_valid(),
                 "analsis_passes is not valid in the argument.");
  for (auto &pass : argument->analysis_passes()) {
    string::PrettyLogH1("--- Running analysis [%s]", pass);
    auto *ptr = PassRegistry::Global().Retreive(pass);
    PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
    ptr->Run(argument);
  }
}
......
...@@ -54,7 +54,7 @@ class Analyzer final {
  DISABLE_COPY_AND_ASSIGN(Analyzer);

 protected:
  void RunAnalysis(Argument* argument);
};

} // namespace analysis
......
...@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
                              "ir_params_sync_among_devices_pass"});

  Analyzer analyser;
  analyser.Run(&argument);
...@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
  argument.SetModelDir(FLAGS_inference_model_dir);
  argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
  argument.SetUseGPU(false);
  argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
                              "ir_params_sync_among_devices_pass"});

  Analyzer analyser;
  analyser.Run(&argument);
......
...@@ -110,16 +110,20 @@ struct Argument {
  // The overall Scope to work on.
  DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);

  // The default program, loaded from disk.
  DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);

  // The ir passes to perform in analysis phase.
  DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
                      std::vector<std::string>);
  DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
                      std::vector<std::string>);

  // Pass a set of op types to enable its mkldnn kernel
  DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
                      std::unordered_set<std::string>);

  // Passed from config.
  DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
  DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
  DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
...@@ -127,6 +131,13 @@ struct Argument {
  DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
  DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);

  // Memory optimization related.
  DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
  DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
  // Indicate which kind of sort algorithm is used for operators; the memory
  // optimization relies on the sort algorithm.
  DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);

  // The program transformed by IR analysis phase.
  DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
                             framework::proto::ProgramDesc);
......
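Each DECL_ARGUMENT_FIELD above presumably expands to a setter, a getter and a *_valid() predicate; that is the pattern the rest of this change relies on. A hedged fragment showing how the new memory-optimization fields are produced and consumed elsewhere in this commit:

// Producer side (analysis_predictor.cc in this change):
argument.SetEnableMemoryOptim(config.enable_memory_optim());

// Consumer side (ir_graph_to_program_pass.cc in this change): optional fields
// must be checked for validity before use, because they may never be set.
if (argument.memory_optim_sort_kind_valid()) {
  int sort_kind = argument.memory_optim_sort_kind();
  VLOG(3) << "memory optim sort kind: " << sort_kind;
}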
...@@ -28,6 +28,13 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"

#ifdef _WIN32
#define GCC_ATTRIBUTE(attr__) ;
#else
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)

namespace paddle {
namespace inference {
namespace analysis {
......
...@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
  PADDLE_ENFORCE(graph.get());
  // Apply all the passes
  for (const auto &pass : passes_) {
    if (pass->Type() == "graph_viz_pass") continue;
    PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
    graph = pass->Apply(std::move(graph));
  }
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)

if (WITH_GPU AND TENSORRT_FOUND)
  cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
  set(analysis_deps ${analysis_deps}
......
...@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
  auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
  for (auto &subgraph : subgraphs) {
    if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
    LOG(INFO) << "detect a subgraph size " << subgraph.size();
    std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
    // replace this sub-graph with the first node. Two steps: 1. Create a Block
    // Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
...@@ -21,6 +21,7 @@
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"

namespace paddle {
namespace inference {
...@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
  framework::BlockDesc block_desc(nullptr, &block_proto);
  block_desc.Proto()->set_parent_idx(-1);
  block_desc.Proto()->set_idx(0);
  string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
                          subgraph.size());

  for (auto *node : subgraph) {
    auto *op = block_desc.AppendOp();
    *op->Proto() = *node->Op()->Proto();
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)

cc_library(analysis_passes SRCS passes.cc DEPS
  ir_graph_build_pass
  ir_analysis_pass
  ir_params_sync_among_devices_pass
  memory_optim_pass
  ir_graph_to_program_pass
  )

set(analysis_deps ${analysis_deps}
  analysis_passes
  subgraph_detector
  CACHE INTERNAL "")
...@@ -13,6 +13,7 @@
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"

namespace paddle {
...@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
  IRPassManager the_ir_manager(argument);
  graph = the_ir_manager.Apply(std::move(graph));
  PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
  argument->SetMainGraph(graph.release());
  CollectFusionStatis(argument);
}

void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
  if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
    LOG(INFO) << "argument has no fuse statis";
    return;
  }
  argument->SetFusionStatis(
      argument->main_graph().Get<Argument::fusion_statis_t>(
          framework::ir::kFuseStatisAttr));
}

std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
......
...@@ -29,6 +29,9 @@ namespace analysis {
class IrAnalysisPass : public AnalysisPass {
 public:
  void RunImpl(Argument* argument) override;

  void CollectFusionStatis(Argument* argument);

  std::string repr() const override;
};
......
...@@ -12,49 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/program_desc.h"

namespace paddle {
namespace inference {
namespace analysis {

void IrGraphToProgramPass::RunImpl(Argument *argument) {
  auto pass =
      framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");

  if (argument->memory_optim_sort_kind_valid()) {
    pass->Set(framework::ir::kGraphToProgramSortKind,
              new int(argument->memory_optim_sort_kind()));
  }

  std::unique_ptr<Graph> graph(argument->main_graph_ptr());
  framework::ProgramDesc desc(argument->main_program());
  pass->SetNotOwned("program", &desc);
  auto thegraph = pass->Apply(std::move(graph));
  thegraph.release();  // the argument still owns the graph.

  argument->SetIrAnalyzedProgram(
      new framework::proto::ProgramDesc(*desc.Proto()));
}

} // namespace analysis
......
...@@ -14,31 +14,17 @@
#pragma once

#include "paddle/fluid/inference/analysis/analysis_pass.h"

namespace paddle {
namespace inference {
namespace analysis {

class IrGraphToProgramPass : public AnalysisPass {
 public:
  void RunImpl(Argument *argument) override;

  std::string repr() const override { return "ir-graph-to-param-pass"; }
};

} // namespace analysis
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
 * Memory optimization pass for inference, based on a pre-analysis of memory
 * usage and without GC.
 * Unlike training, the inference memory reuse strategy does not include GC,
 * because that overhead is too high when the batch size equals one.
 *
 * The inference memory reuse tries to pre-determine the tensor reusing
 * strategy without runtime overhead.
 *
 * To improve the strategy's quality, a warm-up run is introduced:
 * - Before officially deploying the inference program, warm it up and
 *   generate a runtime cache,
 * - Run the inference program with several batches of data; it will persist
 *   some runtime information about tensor memory to disk, which we call the
 *   memory reusing cache,
 * - With the memory reusing cache, the user can deploy the inference to a
 *   service; before running the model, the inference program loads the
 *   memory cache, analyzes it, generates the best memory reusing strategy,
 *   and adjusts the execution of the network accordingly.
 *
 * With the warm-up and memory reusing cache design, the memory reusing
 * algorithm can analyze the real memory consumption of the tensors, even with
 * flexible LoDTensors and shape-changing operators such as sequence pooling.
 */
class MemoryOptimizePass : public AnalysisPass {
public:
using space_table_t = std::unordered_map<std::string, size_t>;
using lifecycle_t = std::pair<int, int>;
struct MemoryAllocation {
size_t allocated; // allocated memory in byte.
size_t saved; // saved memory in byte.
int sort_kind; // the kind of the corresponding sorting algorithm.
// Get the memory saving ratio of temporary variables.
float GetSavingRatio() const;
};
virtual ~MemoryOptimizePass() = default;
protected:
void RunImpl(Argument *argument) override;
private:
void CollectLifeCycle(
std::unordered_map<std::string, lifecycle_t> *lifecycles,
int sort_kind) const;
void CollectVarMemorySize(
const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
space_table_t *space_table) const;
// Returns percentage of saved memory.
void MakeReusePlan(
const std::vector<std::unordered_set<std::string>> &var_clusters,
const std::unordered_map<std::string, size_t> &var_batch_ave_size,
const space_table_t &space_table,
std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
MemoryAllocation *memory_allocation) const;
void PerformReusePlan(
const std::unordered_map<std::string, std::string> &reuse_table,
int sort_kind, std::unordered_set<std::string> *vars2remove) const;
public:
std::string repr() const override;
private:
mutable framework::ir::Graph *graph_{nullptr};
mutable int max_lifecycle_{-1};
};
static std::string GetMemoryCachePath(const std::string &model_path,
const std::string &prog_path) {
auto path = model_path.empty() ? prog_path : model_path;
return path + ".memory_cache";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
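The reuse idea that the class above implements can be illustrated with a small, self-contained sketch. This is not the actual MakeReusePlan implementation; the variable names, lifecycles and greedy policy below are assumptions for illustration only. The point is that tensors whose lifecycles do not overlap may share one allocation.

#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <utility>

using Lifecycle = std::pair<int, int>;  // first/last op index touching the var

// Greedily map each variable onto an earlier "cluster" whose merged lifecycle
// does not overlap its own, so both can occupy the same buffer.
std::map<std::string, std::string> MakeGreedyReusePlan(
    const std::map<std::string, Lifecycle> &lifecycles) {
  std::map<std::string, std::string> reuse;        // var -> var it reuses
  std::map<std::string, Lifecycle> cluster_spans;  // cluster head -> merged span
  for (const auto &item : lifecycles) {
    bool assigned = false;
    for (auto &cluster : cluster_spans) {
      if (item.second.first > cluster.second.second ||
          item.second.second < cluster.second.first) {  // no overlap
        reuse[item.first] = cluster.first;
        cluster.second.first = std::min(cluster.second.first, item.second.first);
        cluster.second.second = std::max(cluster.second.second, item.second.second);
        assigned = true;
        break;
      }
    }
    if (!assigned) cluster_spans[item.first] = item.second;  // keeps its own buffer
  }
  return reuse;
}

int main() {
  // Hypothetical lifecycles, as they might be collected from a warm-up run.
  std::map<std::string, Lifecycle> lifecycles{
      {"fc_0.tmp_0", {0, 2}}, {"relu_0.tmp_0", {2, 4}}, {"fc_1.tmp_0", {5, 7}}};
  for (const auto &kv : MakeGreedyReusePlan(lifecycles)) {
    std::cout << kv.first << " reuses the buffer of " << kv.second << "\n";
  }
  return 0;
}

A real plan would also consult the space_table_t sizes so that a variable only reuses a buffer that is large enough; the pass above additionally reports the saving through MemoryAllocation::GetSavingRatio().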
...@@ -13,24 +13,31 @@
// limitations under the License.

#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"

namespace paddle {
namespace inference {
namespace analysis {

PassRegistry::PassRegistry() {
  // Register manually to avoid the trivial `USE_OP` like macro for easier use
  // and link.
  passes_.emplace("ir_analysis_pass",
                  std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
  passes_.emplace("ir_graph_build_pass",
                  std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
  passes_.emplace("memory_optimize_pass",
                  std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
  passes_.emplace(
      "ir_params_sync_among_devices_pass",
      std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
  passes_.emplace(
      "ir_graph_to_program_pass",
      std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
}

} // namespace analysis
......
...@@ -18,8 +18,10 @@ if(APPLE)
endif(APPLE)

set(inference_deps ${analysis_deps}
    paddle_inference_api paddle_fluid_api
    analysis pass naive_executor
    ${GLOB_PASS_LIB})

if(WITH_GPU AND TENSORRT_FOUND)
  set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
...@@ -29,7 +31,8 @@ add_subdirectory(details)
cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
  reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
  lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
  analysis_config paddle_pass_builder zero_copy_tensor
...@@ -44,7 +47,7 @@ if(WITH_TESTING)
  ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
  set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
  ARGS --dirname=${WORD2VEC_MODEL_DIR})
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
...@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
  model_dir_ = model_dir;

  Update();
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
                                        const std::string &params_file) {
  prog_file_ = prog_file;
  params_file_ = params_file;

  Update();
}
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
                                       const std::string &params_file_path) {
  prog_file_ = prog_file_path;
  params_file_ = params_file_path;

  Update();
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
                                           int device_id) {
...@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
  memory_pool_init_size_mb_ = memory_pool_init_size_mb;
  device_id_ = device_id;
#else
  LOG(ERROR) << "Please compile with gpu to EnableGpu()";
  use_gpu_ = false;
#endif

  Update();
}

void contrib::AnalysisConfig::DisableGpu() {
  use_gpu_ = false;

  Update();
}

contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
...@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  CP_MEMBER(use_gpu_);
  CP_MEMBER(device_id_);
  CP_MEMBER(memory_pool_init_size_mb_);

  CP_MEMBER(enable_memory_optim_);
  CP_MEMBER(memory_optim_force_update_);

  // TensorRT releated.
  CP_MEMBER(use_tensorrt_);
  CP_MEMBER(tensorrt_workspace_size_);
...@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
  }
#undef CP_MEMBER

  Update();
}
void contrib::AnalysisConfig::EnableMKLDNN() {
...@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
  use_mkldnn_ = false;
#endif

  Update();
}

void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
                                                   int max_batch_size,
                                                   int min_subgraph_size) {
#ifdef PADDLE_WITH_CUDA
  if (!use_gpu()) {
    LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
    return;
  }

  use_tensorrt_ = true;
  tensorrt_workspace_size_ = workspace_size;
  tensorrt_max_batchsize_ = max_batch_size;
  tensorrt_min_subgraph_size_ = min_subgraph_size;

  Update();
#else
  LOG(ERROR)
      << "To use TensorRT engine, please compile inference lib with GPU first.";
#endif
}
// TODO(Superjomn) refactor this, buggy.
void contrib::AnalysisConfig::Update() {
  auto info = SerializeInfoCache();
  if (info == serialized_info_cache_) return;

  // Transfer pass_builder and copy the existing compatible passes.
  if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
    if (use_gpu()) {
      pass_builder_.reset(new GpuPassStrategy);

      if (use_tensorrt_) {
        // Append after the Affine_channel_conv_fuse pass.
        pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
      }
    } else {
      pass_builder_.reset(new CpuPassStrategy);
    }
  } else {
    if (use_gpu()) {
      pass_builder_.reset(new GpuPassStrategy(
          *static_cast<GpuPassStrategy *>(pass_builder_.get())));
    } else {
      pass_builder_.reset(new CpuPassStrategy(
          *static_cast<CpuPassStrategy *>(pass_builder_.get())));
    }
  }

  if (use_tensorrt_) {
    const auto &passes = pass_builder_->AllPasses();
    if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
        std::end(passes)) {
      // Append after the Affine_channel_conv_fuse pass.
      pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
    }
...@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
#endif
  }

  if (enable_memory_optim_) {
    pass_builder()->AppendAnalysisPass("memory_optimize_pass");
  }

  if (ir_debug_) {
    pass_builder()->TurnOnDebug();
  }
...@@ -172,24 +224,43 @@ std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::string contrib::AnalysisConfig::SerializeInfoCache() {
  std::stringstream ss;
  ss << model_dir_;
  ss << prog_file_;
  ss << params_file_;

  ss << use_gpu_;
  ss << device_id_;
  ss << memory_pool_init_size_mb_;

  ss << use_tensorrt_;
  ss << tensorrt_workspace_size_;
  ss << tensorrt_max_batchsize_;
  ss << tensorrt_min_subgraph_size_;

  ss << enable_memory_optim_;
  ss << memory_optim_force_update_;

  ss << use_mkldnn_;
  for (auto &item : mkldnn_enabled_op_types_) ss << item;
  ss << ";";

  ss << model_from_memory_;

  ss << enable_ir_optim_;
  ss << use_feed_fetch_ops_;
  ss << ir_debug_;

  ss << specify_input_name_;
  ss << cpu_math_library_num_threads_;

  return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
    int cpu_math_library_num_threads) {
  cpu_math_library_num_threads_ = cpu_math_library_num_threads;

  Update();
}

float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
...@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
enable_memory_optim_ = true;
memory_optim_force_update_ = force_update_cache;
Update();
}
bool contrib::AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
                                             size_t prog_buffer_size,
                                             const char *param_buffer,
...@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
  prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
  params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
  model_from_memory_ = true;

  Update();
}

} // namespace paddle
...@@ -24,18 +24,21 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif

DECLARE_bool(profile);
namespace paddle {
...@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
    LOG(ERROR) << "fail to get fetches";
    return false;
  }

  // Collect variable shapes for memory optimization.
  if (need_collect_var_shapes_for_memory_optim()) {
    CollectVarShapes();
  }

  VLOG(3) << "predict cost: " << timer.toc() << "ms";

  // All the containers in the scope will be hold in inference, but the
...@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  argument_.SetUseGPU(config_.use_gpu());
  argument_.SetGPUDeviceId(config_.gpu_device_id());
  argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
  argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
  argument_.SetModelFromMemory(config_.model_from_memory_);
  // Analyze inference_program
  if (!config_.model_dir().empty()) {
...@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }

  if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
    LOG(INFO) << "TensorRT subgraph engine is enabled";
    argument_.SetUseTensorRT(true);
    argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
    argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
...@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
  }

  if (config_.use_mkldnn_) {
    LOG(INFO) << "MKLDNN is enabled";
    argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
  }

  auto passes = config_.pass_builder()->AllPasses();
  if (!config_.ir_optim()) {
    passes.clear();
    LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
  }
  argument_.SetIrAnalysisPasses(passes);
  argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
  argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
  Analyzer().Run(&argument_);
...@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }

  // TODO(Superjomn) deduce the directory path.
  std::string out_path = inference::analysis::GetMemoryCachePath(
      config_.model_dir(), config_.prog_file());
  if (need_collect_var_shapes_for_memory_optim()) {
    SerializeBatchVarShapes(out_path);
  }
}

std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
...@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
  return std::unique_ptr<PaddlePredictor>(x);
}
void AnalysisPredictor::CollectVarShapes() {
VLOG(4) << "Collecting var shapes";
if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
std::map<std::string, std::vector<int>> var_shapes;
for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
auto *var = sub_scope_->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
auto &tensor = var->Get<framework::LoDTensor>();
auto shape = framework::vectorize(tensor.dims());
var_shapes[var_name].assign(shape.begin(), shape.end());
}
}
batch_var_shapes_.push_back(var_shapes);
LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
<< " batch of var shapes for analysis";
}
void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
LOG(INFO) << "serialize batch var shapes to " << path;
std::ofstream file(path);
if (!file.is_open()) {
LOG(ERROR) << "failed to serialize the var shapes to " << path;
return;
}
// The serialized data format:
// <tensor_name>:dim0,dim1,dim2,;
for (auto &batch : batch_var_shapes_) {
for (auto &ele : batch) {
file << ele.first << ":";
for (size_t i = 0; i < ele.second.size() - 1; i++) {
file << ele.second[i] << ",";
}
file << ele.second.back() << ";";
}
file << "\n";
}
}
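Given that format, the cache file written by two warm-up batches would look roughly like the following; the variable names and dimensions are made up for illustration:

feed_0:4,1;embedding_0.tmp_0:4,1,128;fc_0.tmp_0:4,128;
feed_0:4,1;embedding_0.tmp_0:4,1,128;fc_0.tmp_0:4,128;

Each line records one batch; the memory_optimize_pass later uses these per-variable shapes (note the batch_var_ave_dim parameter in its header above) to estimate tensor sizes.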
bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
bool need = false;
// check if the cache exists
if (!config_.enable_memory_optim()) {
need = false;
} else if (config_.enable_memory_optim() &&
!inference::IsFileExists(inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file()))) {
need = true;
} else if (config_.enable_memory_optim() &&
config_.memory_optim_force_update_) {
need = true;
}
need_collect_var_shapes_ = need ? 1 : 0;
return need;
}
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
    const contrib::AnalysisConfig &config) {
......
...@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
  void SetMkldnnThreadID(int tid);

 protected:
  // For memory optimization.
  bool need_collect_var_shapes_for_memory_optim();
  void CollectVarShapes();
  void SerializeBatchVarShapes(const std::string &path);

  bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
  bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
  bool CreateExecutor();
...@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
  // A mutex help to make Clone thread safe.
  std::mutex clone_mutex_;

  // For memory optimization.
  const size_t max_shape_collect_count_{1000};
  int need_collect_var_shapes_{-1};  // -1 for default, 0 for false, 1 for true.
  std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;

 private:
  // Some status here that help to determine the status inside the predictor.
  bool status_program_optimized_{false};
......
...@@ -16,8 +16,10 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread>  // NOLINT
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"

DEFINE_string(dirname, "", "dirname to tests.");
...@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
  }
}
TEST(AnalysisPredictor, memory_optim) {
AnalysisConfig config(FLAGS_dirname);
config.DisableGpu();
config.EnableMemoryOptim(true);
config.pass_builder()->TurnOnDebug();
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
// Prepare dummy input data.
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor;
tensor.shape = std::vector<int>({4, 1});
tensor.data.Reset(data, sizeof(data));
tensor.dtype = PaddleDType::INT64;
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> output, output1;
{
// The first predictor helps to cache the memory optimization strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
// Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) {
ASSERT_TRUE(predictor->Run(inputs, &output));
}
}
{
output.clear();
// The second predictor reuses the cached shapes to perform memory optimization.
config.EnableMemoryOptim(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
// Run with memory optimization
ASSERT_TRUE(predictor->Run(inputs, &output));
}
// Run native
ASSERT_TRUE(native_predictor->Run(inputs, &output1));
LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
LOG(INFO) << "the native output "
<< inference::DescribeTensor(output1.front());
inference::CompareResult(output, output1);
}
} // namespace paddle } // namespace paddle
#!/bin/bash
set -x set -x
PADDLE_ROOT=$1 PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas TURN_ON_MKL=$2 # use MKL or Openblas
......
...@@ -15,7 +15,10 @@ ...@@ -15,7 +15,10 @@
#pragma once #pragma once
#include <glog/logging.h> #include <glog/logging.h>
#include <fstream>
#if !defined(_WIN32)
#include <sys/time.h>
#endif
#include <algorithm> #include <algorithm>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <iterator> #include <iterator>
...@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) { ...@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
return true; return true;
} }
static std::string DescribeTensor(const PaddleTensor &tensor) { static std::string DescribeTensor(const PaddleTensor &tensor,
int max_num_of_data = 15) {
std::stringstream os; std::stringstream os;
os << "Tensor [" << tensor.name << "]\n"; os << "Tensor [" << tensor.name << "]\n";
os << " - type: "; os << " - type: ";
...@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid, ...@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
} }
} }
static bool IsFileExists(const std::string &path) {
std::ifstream file(path);
bool exists = file.is_open();
file.close();
return exists;
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -192,6 +192,13 @@ struct AnalysisConfig { ...@@ -192,6 +192,13 @@ struct AnalysisConfig {
*/ */
bool model_from_memory() const { return model_from_memory_; } bool model_from_memory() const { return model_from_memory_; }
/** Turn on memory optimization.
* NOTE still in development, will be released later.
*/
void EnableMemoryOptim(bool force_update_cache = false);
/** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const;
friend class ::paddle::AnalysisPredictor; friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken. /** NOTE just for developer, not an official API, easily to be broken.
...@@ -232,6 +239,10 @@ struct AnalysisConfig { ...@@ -232,6 +239,10 @@ struct AnalysisConfig {
// subgraph, 3 as default value. // subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3}; int tensorrt_min_subgraph_size_{3};
// memory reuse related.
bool enable_memory_optim_{false};
bool memory_optim_force_update_{false};
bool use_mkldnn_{false}; bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_; std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include <glog/logging.h> #include <glog/logging.h>
namespace paddle { namespace paddle {
...@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() { ...@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet"; LOG(ERROR) << "GPU not support MKLDNN yet";
} }
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass);
}
} // namespace paddle } // namespace paddle
...@@ -45,6 +45,9 @@ class PaddlePassBuilder { ...@@ -45,6 +45,9 @@ class PaddlePassBuilder {
/** Delete all the passes that has type `pass_type`. */ /** Delete all the passes that has type `pass_type`. */
void DeletePass(const std::string &pass_type); void DeletePass(const std::string &pass_type);
/** Append an analysis pass. */
void AppendAnalysisPass(const std::string &pass);
/** Visualize the computation graph after each pass by generating a DOT /** Visualize the computation graph after each pass by generating a DOT
* language file, one can draw them with the Graphviz toolkit. * language file, one can draw them with the Graphviz toolkit.
*/ */
...@@ -54,8 +57,18 @@ class PaddlePassBuilder { ...@@ -54,8 +57,18 @@ class PaddlePassBuilder {
std::string DebugString(); std::string DebugString();
const std::vector<std::string> &AllPasses() const { return passes_; } const std::vector<std::string> &AllPasses() const { return passes_; }
std::vector<std::string> AnalysisPasses() const {
auto passes = analysis_passes_;
// Make sure ir_graph_to_program_pass is always the last pass, so that any
// modification of the IR persists to the program.
passes.push_back("ir_graph_to_program_pass");
return passes;
}
protected: protected:
std::vector<std::string> analysis_passes_{
{"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"}};
std::vector<std::string> passes_; std::vector<std::string> passes_;
}; };
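With the defaults above, AnalysisPasses() therefore returns the build, analysis, and device-sync passes, anything appended through AppendAnalysisPass(), and finally the program conversion pass; roughly:

// Sketch of the list returned by AnalysisPasses() with the defaults above:
//   ir_graph_build_pass
//   ir_analysis_pass
//   ir_params_sync_among_devices_pass
//   ...any passes added via AppendAnalysisPass()...
//   ir_graph_to_program_pass   // always appended last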
...@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
/** The MKLDNN control exists in both CPU and GPU mode, because there can be /** The MKLDNN control exists in both CPU and GPU mode, because there can be
* still some CPU kernels running in CPU mode. * still some CPU kernels running in CPU mode.
*/ */
virtual void EnableMKLDNN() = 0; virtual void EnableMKLDNN() {}
bool use_gpu() const { return use_gpu_; } bool use_gpu() const { return use_gpu_; }
...@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder { ...@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
protected: protected:
bool use_gpu_{false}; bool use_gpu_{false};
bool use_mkldnn_{false};
}; };
/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode. /** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
...@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy { ...@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
use_gpu_ = false; use_gpu_ = false;
} }
explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
virtual ~CpuPassStrategy() = default; virtual ~CpuPassStrategy() = default;
void EnableMKLDNN() override { void EnableMKLDNN() override {
// TODO(Superjomn) Consider the way to mix CPU with GPU. // TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
if (!use_mkldnn_) {
passes_.insert(passes_.begin(), "mkldnn_placement_pass"); passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass : for (auto &pass : std::vector<std::string>(
std::vector<std::string>({"depthwise_conv_mkldnn_pass", // {"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", // "conv_bias_mkldnn_fuse_pass", //
"conv3d_bias_mkldnn_fuse_pass", // "conv3d_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass"})) { "conv_elementwise_add_mkldnn_fuse_pass"})) {
passes_.push_back(pass); passes_.push_back(pass);
} }
}
use_mkldnn_ = true;
#else
use_mkldnn_ = false;
#endif #endif
} }
CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
}; };
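The new use_mkldnn_ flag makes EnableMKLDNN() idempotent on CPU: calling it more than once no longer duplicates the MKLDNN passes. A minimal sketch, assuming a build with PADDLE_WITH_MKLDNN defined:

// Sketch only; assumes PADDLE_WITH_MKLDNN is defined at build time.
paddle::CpuPassStrategy strategy;
size_t before = strategy.AllPasses().size();
strategy.EnableMKLDNN();  // inserts mkldnn_placement_pass and the fuse passes
strategy.EnableMKLDNN();  // no-op: use_mkldnn_ is already true
size_t after = strategy.AllPasses().size();
// "after" exceeds "before" by the number of MKLDNN passes exactly once.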
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode. /** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
...@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy { ...@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
use_gpu_ = true; use_gpu_ = true;
} }
GpuPassStrategy(const GpuPassStrategy &other) explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) { : PassStrategy(other.AllPasses()) {
use_gpu_ = true; use_gpu_ = true;
} }
......
...@@ -19,7 +19,7 @@ endfunction() ...@@ -19,7 +19,7 @@ endfunction()
function(inference_analysis_api_test target install_dir filename) function(inference_analysis_api_test target install_dir filename)
inference_analysis_test(${target} SRCS ${filename} inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt) ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction() endfunction()
...@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 ...@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM # normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
# small DAM # small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
......
...@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data, ...@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
std::string turn_mask_pre = "turn_mask_"; std::string turn_mask_pre = "turn_mask_";
auto one_batch = data->NextBatch(); auto one_batch = data->NextBatch();
PADDLE_ENFORCE(!one_batch.response.empty());
int size = one_batch.response[0].size(); int size = one_batch.response[0].size();
CHECK_EQ(size, kMaxTurnLen); CHECK_EQ(size, kMaxTurnLen);
// turn tensor assignment // turn tensor assignment
...@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) { ...@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg), TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads); input_slots_all, &outputs, FLAGS_num_threads);
...@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) { ...@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all); reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
} }
// Compare result of NativeConfig and AnalysisConfig with memory optimization.
TEST(Analyzer_dam, compare_with_memory_optim) {
// The small DAM model core dumps in CI (though it works locally), so only
// test the normal DAM (max_turn_num == 9) here.
if (FLAGS_max_turn_num == 9) {
contrib::AnalysisConfig cfg, cfg1;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
// First run: force an update of the memory cache.
SetConfig(&cfg);
cfg.EnableMemoryOptim(true);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
// Second run: use the memory cache and perform memory optimization.
SetConfig(&cfg1);
cfg1.EnableMemoryOptim();
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
input_slots_all);
}
}
TEST(Analyzer_dam, compare) { compare(); } TEST(Analyzer_dam, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif #endif
......
...@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { ...@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST(Analyzer_Text_Classification, profile) { TEST(Analyzer_Text_Classification, profile) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.pass_builder()->TurnOnDebug();
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) { ...@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
TEST(Analyzer_Text_Classification, compare) { TEST(Analyzer_Text_Classification, compare) {
AnalysisConfig cfg; AnalysisConfig cfg;
SetConfig(&cfg); SetConfig(&cfg);
cfg.EnableMemoryOptim();
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all); SetInput(&input_slots_all);
......
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <gtest/gtest.h>
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h" #include "paddle/fluid/inference/tests/api/tester_helper.h"
...@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
FLAGS_infer_model + "/__params__"); FLAGS_infer_model + "/__params__");
cfg->DisableGpu(); cfg->DisableGpu();
cfg->SwitchIrDebug(); cfg->SwitchIrDebug();
cfg->SwitchSpecifyInputNames(); cfg->SwitchSpecifyInputNames(false);
// TODO(TJ): fix fusion gru // TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
} }
...@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) { ...@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
if (use_mkldnn) { if (use_mkldnn) {
cfg.EnableMKLDNN(); cfg.EnableMKLDNN();
} }
// cfg.pass_builder()->TurnOnDebug();
std::vector<PaddleTensor> outputs; std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all; std::vector<std::vector<PaddleTensor>> input_slots_all;
...@@ -103,8 +105,7 @@ void profile(bool use_mkldnn = false) { ...@@ -103,8 +105,7 @@ void profile(bool use_mkldnn = false) {
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
CHECK_EQ(numel, refer.data.size()); CHECK_EQ(numel, refer.data.size());
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
CHECK_LT( EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
1e-5); 1e-5);
} }
} }
......
...@@ -15,6 +15,7 @@ ...@@ -15,6 +15,7 @@
#pragma once #pragma once
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <algorithm> #include <algorithm>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
...@@ -28,9 +29,8 @@ ...@@ -28,9 +29,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h" #include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h" #include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/utils/benchmark.h" #include "paddle/fluid/inference/utils/benchmark.h"
...@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs, ...@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
float *pdata = static_cast<float *>(out.data.data()); float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data()); float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) { for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy); CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
} }
break; break;
} }
......
...@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
} }
} }
TEST(TensorRT_mobilenet, profile) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
profile(model_dir, true, false);
}
} // namespace inference } // namespace inference
} // namespace paddle } // namespace paddle
...@@ -11,8 +11,8 @@ ...@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#pragma once
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
#include <string> #include <string>
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <glog/logging.h> #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
using namespace paddle::inference; using namespace paddle::inference; // NOLINT
TEST(Benchmark, basic) { TEST(Benchmark, basic) {
Benchmark benchmark; Benchmark benchmark;
benchmark.SetName("key0"); benchmark.SetName("key0");
......
...@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase { ...@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
<< out_name; << out_name;
auto &feed_list = feed_var->Get<framework::FeedFetchList>(); auto &feed_list = feed_var->Get<framework::FeedFetchList>();
PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
auto &feed_item = feed_list.at(static_cast<size_t>(col)); auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>(); auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
......
...@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt, ...@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
std::cerr << style << Sprintf(fmt, args...) << reset(); std::cerr << style << Sprintf(fmt, args...) << reset();
} }
template <typename... Args>
static void PrettyLogInfo(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::info(), fmt, args...);
}
template <typename... Args>
static void PrettyLogDetail(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::detail(), fmt, args...);
}
template <typename... Args>
static void PrettyLogH1(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::H1(), fmt, args...);
}
template <typename... Args>
static void PrettyLogH2(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::H2(), fmt, args...);
}
} // namespace string } // namespace string
} // namespace paddle } // namespace paddle
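These wrappers only pick a preset Style and forward to PrettyLogEndl. A minimal usage sketch, assuming this header is paddle/fluid/string/pretty_log.h; the messages themselves are made up:

#include "paddle/fluid/string/pretty_log.h"

// Illustrative only; the report text is made up.
void ReportMemoryOptim(int reused_vars) {
  paddle::string::PrettyLogH1("--- memory optimize report ---");
  // Sprintf-style format specifiers are forwarded through PrettyLogEndl.
  paddle::string::PrettyLogDetail("---    reused %d variables", reused_vars);
}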