Unverified commit 885c4e57 authored by Y Yan Chunwei, committed by GitHub

fea/infer memory optim2 (#14953)

Parent 6597ccb0
......@@ -15,6 +15,7 @@
#include "paddle/fluid/framework/ir/fc_fuse_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
......
......@@ -18,8 +18,10 @@ limitations under the License. */
#include <fstream>
#include <iosfwd>
#include <ostream>
#include <stack>
#include <unordered_map>
#include <unordered_set>
#include "paddle/fluid/framework/ir/graph_traits.h"
DEFINE_string(print_sub_graph_dir, "",
"FLAGS_print_sub_graph_dir is used "
......@@ -41,7 +43,7 @@ void SortHelper(
}
}
VLOG(3) << "topology sort insert: " << node->Name()
VLOG(5) << "topology sort insert: " << node->Name() << " "
<< reinterpret_cast<void *>(node) << " input " << node->inputs.size();
ret->push_back(node);
}
......@@ -99,12 +101,13 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
return ret;
}
// Build operator inlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
for (auto &n : graph.Nodes()) {
if (n->NodeType() != ir::Node::Type::kOperation) continue;
if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
......@@ -121,6 +124,119 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
return adj_list;
}
// Build operator outlink edge table.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationOutAdjList(
const Graph &graph) {
std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
for (auto &n : graph.Nodes()) {
if (!n->IsOp()) continue;
if (adj_list.find(n) == adj_list.end()) {
adj_list[n] = std::unordered_set<ir::Node *>();
}
for (auto &var : n->outputs) {
for (auto &adj_n : var->outputs) {
PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
VLOG(40) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
<< " -> " << n->Name() << reinterpret_cast<void *>(n)
<< " via " << var->Name() << reinterpret_cast<void *>(var);
adj_list[n].insert(adj_n);
}
}
}
return adj_list;
}
std::vector<ir::Node *> OpDFSSort(const Graph &graph) {
auto edge_table = BuildOperationOutAdjList(graph);
std::stack<Node *> stack;
for (auto &ele : edge_table) {
if (ele.first->inputs.empty()) {
// find the input ops (those without input vars)
stack.push(ele.first);
} else {
// find the ops with only persistable vars as inputs.
bool all_persistable = true;
for (auto *input : ele.first->inputs) {
if (!(input->IsVar() && input->Var() && input->Var()->Persistable())) {
all_persistable = false;
}
}
if (all_persistable) {
stack.push(ele.first);
}
}
}
std::vector<Node *> res;
// start from the feed op and DFS
std::unordered_set<Node *> unique_set;
while (!stack.empty()) {
// will start from the last feed by default.
auto cur = stack.top();
stack.pop();
unique_set.insert(cur);
res.push_back(cur);
for (auto *op : edge_table[cur]) {
if (!unique_set.count(op)) {
stack.push(op);
}
}
}
return res;
}
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph) {
std::vector<ir::Node *> nodes;
std::unordered_map<Node *, int> in_degree;
auto set_out_ops_ready = [&](Node *var) {
for (auto *op : var->outputs) {
--in_degree[op];
}
};
// build in_degree
for (auto *node : graph.Nodes()) {
if (node->IsOp()) {
in_degree[node] += node->inputs.size();
} else if (node->IsVar() && node->inputs.empty()) {
// mark all the inputs of the whole graph as ready.
set_out_ops_ready(node);
}
}
std::deque<Node *> op_queue;
// first visit
for (auto &node : OpDFSSort(graph)) {
if (node->IsOp()) {
op_queue.push_back(node);
}
}
// traverse the graph
int num_ops = op_queue.size();
while (num_ops) {
for (auto it = op_queue.begin(); it != op_queue.end(); it++) {
auto *&cur_op = *it;
if (!cur_op || in_degree[cur_op] > 0) continue;
// visit this node
// mark all the output vars of this op as valid.
for (auto *out_var : cur_op->outputs) {
if (!out_var) continue;
set_out_ops_ready(out_var);
}
VLOG(8) << "visit " << cur_op->Name();
nodes.push_back(cur_op);
cur_op = nullptr;
num_ops--;
}
}
return nodes;
}
size_t GraphNum(const Graph &graph) {
std::unordered_set<ir::Node *> nodes(graph.Nodes());
std::unordered_set<ir::Node *> visited_nodes;
......@@ -203,6 +319,29 @@ size_t GraphNum(const Graph &graph) {
return graph_count;
}
void CleanIndividualNodes(Graph *graph) {
std::unordered_set<Node *> nodes2rm;
for (auto *node : graph->Nodes()) {
if (node->inputs.empty() && node->outputs.empty()) {
nodes2rm.insert(node);
}
}
for (auto *node : nodes2rm) {
graph->RemoveNode(node);
}
}
std::vector<Node *> TopologyVarientSort(const Graph &graph,
SortKind sort_kind) {
switch (sort_kind) {
case SortKind::TS:
return framework::ir::TopologySortOperations(graph);
default:
return framework::ir::TopologyDfsSortOperations(graph);
}
}
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -34,6 +34,23 @@ size_t GraphNum(const Graph &graph);
// `graph` cannot contain a cycle.
std::vector<ir::Node *> TopologySortOperations(const Graph &graph);
// Topological sort, but tries to follow a DFS order.
std::vector<ir::Node *> TopologyDfsSortOperations(const Graph &graph);
// Different kinds of sorting for turning the operators in a graph into a sequence.
enum class SortKind {
// Topological Search
TS = 0,
// Topological and Depth First Search
TDFS
};
// Several kinds of topological sort.
std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
// Clean the nodes that don't connect to others.
void CleanIndividualNodes(Graph *graph);
// Build an adjacency list of operations for the `graph`.
std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
const Graph &graph);
......
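As a minimal usage sketch of the sort helpers declared above (assuming a framework::ir::Graph instance named `g` that has already been built; the variable names are illustrative only):

// Plain topological order of the operators.
std::vector<paddle::framework::ir::Node *> ts_order =
    paddle::framework::ir::TopologyVarientSort(
        g, paddle::framework::ir::SortKind::TS);
// Topological order that tries to follow DFS chains; the inference memory
// optimization relies on this ordering.
std::vector<paddle::framework::ir::Node *> tdfs_order =
    paddle::framework::ir::TopologyVarientSort(
        g, paddle::framework::ir::SortKind::TDFS);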
......@@ -20,7 +20,6 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
......@@ -29,6 +28,14 @@ namespace ir {
std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std::unique_ptr<Graph> graph) const {
// Remove the unneeded variables after memory optimization.
std::unordered_set<std::string> vars2remove;
if (graph->Has(kGraphToProgramVarsToRemove)) {
vars2remove = graph->Get<std::unordered_set<std::string>>(
kGraphToProgramVarsToRemove);
VLOG(2) << "graph to program remove " << vars2remove.size() << " nodes";
}
ProgramDesc& program = Get<ProgramDesc>("program");
std::unique_ptr<proto::ProgramDesc> program_pb(
......@@ -40,25 +47,35 @@ std::unique_ptr<Graph> GraphToProgramPass::ApplyImpl(
std::unordered_set<std::string> visited_vars;
for (ir::Node* n : graph->Nodes()) {
if (n->IsVar()) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0) {
if (n->Var() && visited_vars.count(n->Var()->Name()) == 0 &&
!vars2remove.count(n->Var()->Name())) {
visited_vars.insert(n->Var()->Name());
block->add_vars()->MergeFrom(*n->Var()->Proto());
}
}
}
block->clear_ops();
std::vector<ir::Node*> nodes = TopologySortOperations(*graph);
std::vector<ir::Node*> nodes;
if (Has(kGraphToProgramSortKind)) {
// The inference memory optimization relies on this branch.
int sort_kind = Get<int>(kGraphToProgramSortKind);
nodes = TopologyVarientSort(
*graph, static_cast<framework::ir::SortKind>(sort_kind));
} else {
nodes = TopologySortOperations(*graph);
}
for (ir::Node* n : nodes) {
if (!n->Op()) {
continue;
}
if (!n->Op()) continue;
block->add_ops()->MergeFrom(*n->Op()->Proto());
}
program.CopyFrom(*program_pb);
return graph;
}
} // namespace ir
} // namespace framework
} // namespace paddle
......
......@@ -20,6 +20,10 @@ namespace paddle {
namespace framework {
namespace ir {
const char kGraphToProgramVarsToRemove[] =
"__graph_to_program_vars_to_remove__";
const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__";
class GraphToProgramPass : public Pass {
protected:
std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override;
......
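A rough sketch of how a client is expected to feed these two attributes to the pass, mirroring the usage elsewhere in this change (`graph` and `program` are placeholder names for an existing std::unique_ptr<framework::ir::Graph> and a framework::ProgramDesc):

auto pass =
    framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
// Optionally list the variables eliminated by memory optimization;
// fill the set with the variable names to drop.
graph->Set(framework::ir::kGraphToProgramVarsToRemove,
           new std::unordered_set<std::string>());
// Optionally pick the operator ordering (see SortKind in graph_helper.h).
pass->Set(framework::ir::kGraphToProgramSortKind,
          new int(static_cast<int>(framework::ir::SortKind::TDFS)));
pass->SetNotOwned("program", &program);
graph = pass->Apply(std::move(graph));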
......@@ -135,4 +135,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes(
} // namespace paddle
REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass)
.RequirePassAttr(paddle::framework::ir::kGraphVizPath);
.RequirePassAttr(paddle::framework::ir::kGraphVizPath);
\ No newline at end of file
......@@ -64,7 +64,7 @@ class Node {
std::string Name() const { return name_; }
VarDesc* Var() {
VarDesc* Var() const {
PADDLE_ENFORCE(IsVar());
return var_desc_.get();
}
......
......@@ -50,8 +50,8 @@ void NaiveExecutor::Run() {
"running Paddle Inference";
#endif // PADDLE_ON_INFERENCE
for (auto &op : ops_) {
VLOG(3) << std::this_thread::get_id() << " run " << op->Type()
<< " on scope " << scope_;
VLOG(4) << std::this_thread::get_id() << " run "
<< op->DebugStringEx(scope_) << " on scope " << scope_;
op->SetIsCalledByExecutor(false);
op->Run(*scope_, place_);
}
......@@ -69,10 +69,12 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
anc = anc->parent();
}
int num_vars = 0;
for (auto &var : global_block.AllVars()) {
if (var->Name() == framework::kEmptyVarName) {
continue;
}
num_vars++;
if (persistable == var->Persistable()) {
if (persistable) {
......@@ -90,6 +92,7 @@ void NaiveExecutor::CreateVariables(const ProgramDesc &desc, int block_id,
}
}
}
VLOG(4) << "naive executor create " << num_vars << " vars";
}
void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
......
......@@ -18,6 +18,7 @@ cc_library(analysis SRCS
analyzer.cc
analysis_pass
DEPS ${analysis_deps} analysis_helper
${INFER_IR_PASSES}
)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
......
......@@ -15,8 +15,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
......@@ -24,13 +24,16 @@ namespace analysis {
Analyzer::Analyzer() {}
void Analyzer::Run(Argument *argument) { RunIrAnalysis(argument); }
void Analyzer::Run(Argument *argument) { RunAnalysis(argument); }
void Analyzer::RunIrAnalysis(Argument *argument) {
std::vector<std::string> passes({"ir_analysis_compose_pass"});
for (auto &pass : passes) {
PassRegistry::Global().Retreive(pass)->Run(argument);
void Analyzer::RunAnalysis(Argument *argument) {
PADDLE_ENFORCE(argument->analysis_passes_valid(),
"analsis_passes is not valid in the argument.");
for (auto &pass : argument->analysis_passes()) {
string::PrettyLogH1("--- Running analysis [%s]", pass);
auto *ptr = PassRegistry::Global().Retreive(pass);
PADDLE_ENFORCE_NOT_NULL(ptr, "no analysis pass called %s", pass);
ptr->Run(argument);
}
}
......
......@@ -54,7 +54,7 @@ class Analyzer final {
DISABLE_COPY_AND_ASSIGN(Analyzer);
protected:
void RunIrAnalysis(Argument* argument);
void RunAnalysis(Argument* argument);
};
} // namespace analysis
......
......@@ -32,6 +32,8 @@ TEST(Analyzer, analysis_without_tensorrt) {
argument.SetModelDir(FLAGS_inference_model_dir);
argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
argument.SetUseGPU(false);
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"});
Analyzer analyser;
analyser.Run(&argument);
......@@ -44,6 +46,8 @@ TEST(Analyzer, analysis_with_tensorrt) {
argument.SetModelDir(FLAGS_inference_model_dir);
argument.SetIrAnalysisPasses({"infer_clean_graph_pass"});
argument.SetUseGPU(false);
argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"});
Analyzer analyser;
analyser.Run(&argument);
......
......@@ -110,16 +110,20 @@ struct Argument {
// The overall Scope to work on.
DECL_ARGUMENT_UNIQUE_FIELD(scope, Scope, framework::Scope);
// The default program, loaded from disk.
DECL_ARGUMENT_UNIQUE_FIELD(main_program, MainProgram, framework::ProgramDesc);
// The ir passes to perform in analysis phase.
DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses,
std::vector<std::string>);
DECL_ARGUMENT_FIELD(analysis_passes, AnalysisPasses,
std::vector<std::string>);
// Pass a set of op types to enable their MKLDNN kernels.
DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes,
std::unordered_set<std::string>);
// Passed from config.
DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool);
DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int);
DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool);
......@@ -127,6 +131,13 @@ struct Argument {
DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int);
DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
// Memory optimization related.
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
DECL_ARGUMENT_FIELD(memory_optim_force_update, MemoryOptimForceUpdate, bool);
// Indicates which sort algorithm is used for the operators; the memory
// optimization relies on the sort algorithm.
DECL_ARGUMENT_FIELD(memory_optim_sort_kind, MemoryOptimSortKind, int);
// The program transformed by IR analysis phase.
DECL_ARGUMENT_UNIQUE_FIELD(ir_analyzed_program, IrAnalyzedProgram,
framework::proto::ProgramDesc);
......
......@@ -28,6 +28,13 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/port.h"
#ifdef _WIN32
#define GCC_ATTRIBUTE(attr__) ;
#else
#define GCC_ATTRIBUTE(attr__) __attribute__((attr__));
#endif
#define __SHOULD_USE_RESULT__ GCC_ATTRIBUTE(warn_unused_result)
namespace paddle {
namespace inference {
namespace analysis {
......
......@@ -83,6 +83,7 @@ std::unique_ptr<Graph> IRPassManager::Apply(std::unique_ptr<Graph> graph) {
PADDLE_ENFORCE(graph.get());
// Apply all the passes
for (const auto &pass : passes_) {
if (pass->Type() == "graph_viz_pass") continue;
PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type());
graph = pass->Apply(std::move(graph));
}
......
cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc)
if (TENSORRT_FOUND)
if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller)
set(analysis_deps ${analysis_deps}
......
......@@ -413,7 +413,6 @@ void SubGraphFuser::ReplaceNodesWithSubGraphs() {
auto subgraphs = SubgraphDetector(graph_, node_inside_subgraph_teller_)();
for (auto &subgraph : subgraphs) {
if (subgraph.size() <= (size_t)min_subgraph_size_) continue;
LOG(INFO) << "detect a subgraph size " << subgraph.size();
std::unordered_set<Node *> subgraph_uniq(subgraph.begin(), subgraph.end());
// replace this sub-graph with the first node. Two steps: 1. Create a Block
// Node that contains this subgraph 2. Mark the nodes inside the sub-graph
......
......@@ -21,6 +21,7 @@
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
......@@ -77,6 +78,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
framework::BlockDesc block_desc(nullptr, &block_proto);
block_desc.Proto()->set_parent_idx(-1);
block_desc.Proto()->set_idx(0);
string::PrettyLogDetail("--- detect a sub-graph with %d nodes",
subgraph.size());
for (auto *node : subgraph) {
auto *op = block_desc.AppendOp();
*op->Proto() = *node->Op()->Proto();
......
cc_library(ir_graph_build_pass SRCS ir_graph_build_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(ir_analysis_pass SRCS ir_analysis_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(memory_optim_pass SRCS memory_optimize_pass.cc DEPS analysis_pass)
cc_library(ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc DEPS analysis_pass argument ir_pass_manager)
cc_library(analysis_passes SRCS passes.cc DEPS ir_graph_build_pass ir_analysis_pass ir_params_sync_among_devices_pass)
cc_library(ir_graph_to_program_pass SRCS ir_graph_to_program_pass.cc DEPS analysis_pass graph_to_program_pass)
cc_library(analysis_passes SRCS passes.cc DEPS
ir_graph_build_pass
ir_analysis_pass
ir_params_sync_among_devices_pass
memory_optim_pass
ir_graph_to_program_pass
)
set(analysis_deps ${analysis_deps}
ir_graph_build_pass
ir_analysis_pass
analysis_passes
subgraph_detector
CACHE INTERNAL "")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
namespace paddle {
......@@ -31,9 +32,18 @@ void IrAnalysisPass::RunImpl(Argument* argument) {
IRPassManager the_ir_manager(argument);
graph = the_ir_manager.Apply(std::move(graph));
PADDLE_ENFORCE_GT(graph->Nodes().size(), 0);
argument->SetIrAnalyzedProgram(new framework::proto::ProgramDesc(
the_ir_manager.AcquireProgram(&graph, argument->main_program())));
argument->SetMainGraph(graph.release());
CollectFusionStatis(argument);
}
void IrAnalysisPass::CollectFusionStatis(Argument* argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
return;
}
argument->SetFusionStatis(
argument->main_graph().Get<Argument::fusion_statis_t>(
framework::ir::kFuseStatisAttr));
}
std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; }
......
......@@ -29,6 +29,9 @@ namespace analysis {
class IrAnalysisPass : public AnalysisPass {
public:
void RunImpl(Argument* argument) override;
void CollectFusionStatis(Argument* argument);
std::string repr() const override;
};
......
......@@ -12,49 +12,32 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/analysis/ir_pass_manager.h"
#include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
#include "paddle/fluid/string/pretty_log.h"
#include "paddle/fluid/framework/program_desc.h"
namespace paddle {
namespace inference {
namespace analysis {
void IrAnalysisComposePass::RunImpl(Argument *argument) {
ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes);
ApplyIrPasses(argument);
CollectFusionStatis(argument);
}
std::string IrAnalysisComposePass::repr() const {
return "ir-analysis-compose-pass";
}
void IrGraphToProgramPass::RunImpl(Argument *argument) {
auto pass =
framework::ir::PassRegistry::Instance().Get("graph_to_program_pass");
void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) {
std::vector<std::string> passes({
"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass",
});
for (const auto &pass : passes) {
VLOG(2) << "Run pass " << pass;
auto *the_pass = PassRegistry::Global().Retreive(pass);
the_pass->Run(argument);
if (argument->memory_optim_sort_kind_valid()) {
pass->Set(framework::ir::kGraphToProgramSortKind,
new int(argument->memory_optim_sort_kind()));
}
}
void IrAnalysisComposePass::CollectFusionStatis(Argument *argument) {
if (!argument->main_graph().Has(framework::ir::kFuseStatisAttr)) {
LOG(INFO) << "argument has no fuse statis";
return;
}
argument->SetFusionStatis(
argument->main_graph().Get<Argument::fusion_statis_t>(
framework::ir::kFuseStatisAttr));
std::unique_ptr<Graph> graph(argument->main_graph_ptr());
framework::ProgramDesc desc(argument->main_program());
pass->SetNotOwned("program", &desc);
auto thegraph = pass->Apply(std::move(graph));
thegraph.release(); // the argument still own the graph.
argument->SetIrAnalyzedProgram(
new framework::proto::ProgramDesc(*desc.Proto()));
}
} // namespace analysis
......
......@@ -14,31 +14,17 @@
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/passes.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* The analysis pass to run a list of IR passes (like a function call).
* Currently, it should be the first pass of analysis phase.
*/
class IrAnalysisComposePass : public AnalysisPass {
class IrGraphToProgramPass : public AnalysisPass {
public:
void RunImpl(Argument* argument) override;
std::string repr() const override;
void RunImpl(Argument *argument) override;
private:
void ApplyIrPasses(Argument* argument);
void CollectFusionStatis(Argument* argument);
// Assign a Scope for IR passes to modify the weights.
void AssignScopeToModify(Argument* argument);
std::string repr() const override { return "ir-graph-to-param-pass"; }
};
} // namespace analysis
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include <algorithm>
#include <fstream>
#include <limits>
#include <map>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/ir/graph_helper.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/graph_to_program_pass.h"
#include "paddle/fluid/framework/ir/graph_traits.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace inference {
namespace analysis {
using framework::ir::Graph;
using framework::ir::Node;
using framework::ir::TopologyVarientSort;
using space_table_t = MemoryOptimizePass::space_table_t;
// Collect the lifecycles of the tensors.
// Traverse the graph in topological order.
// The traversal order also affects the lifecycles, so different sort_kinds are
// used.
void MemoryOptimizePass::CollectLifeCycle(
std::unordered_map<std::string, lifecycle_t>* lifecycles,
int sort_kind) const {
max_lifecycle_ = 0;
for (auto* op_node : framework::ir::TopologyVarientSort(
*graph_, static_cast<framework::ir::SortKind>(sort_kind))) {
if (!op_node->IsOp()) continue;
auto reads = op_node->inputs;
auto writes = op_node->outputs;
std::vector<Node*> requires(reads.begin(), reads.end());
requires.insert(requires.end(), writes.begin(), writes.end());
// Disable reuse of feed variables.
if (op_node->Name() == "feed") {
for (auto* node : op_node->outputs) {
auto var = node->Name();
lifecycles->emplace(var,
std::make_pair(0, std::numeric_limits<int>::max()));
}
} else {
// Normal operators.
for (const Node* node : requires) {
if (node->Var()->Persistable()) continue;
std::string var = node->Name();
if (!lifecycles->count(var)) {
(*lifecycles)[var] = std::make_pair(max_lifecycle_, max_lifecycle_);
} else {
(*lifecycles)[var].second =
std::max(max_lifecycle_, lifecycles->at(var).second); // max()
}
}
}
++max_lifecycle_;
}
}
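For example, if a non-persistable variable is first touched by the operator visited at position 3 of the chosen order and last touched by the operator at position 7, its lifecycle is recorded as the pair (3, 7); outputs of the feed op are instead pinned to (0, INT_MAX) so that they are never reused.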
// TODO(Superjomn) Make this a general helper method.
int DataTypeToSpace(framework::proto::VarType_Type type) {
switch (type) {
case framework::proto::VarType_Type_BOOL:
return sizeof(bool);
case framework::proto::VarType_Type_FP32:
return sizeof(float);
case framework::proto::VarType_Type_INT32:
return sizeof(int32_t);
case framework::proto::VarType_Type_INT64:
return sizeof(int64_t);
default:
PADDLE_THROW("Unknown data type");
}
}
// Collect the memory size of the tensors.
void MemoryOptimizePass::CollectVarMemorySize(
const std::unordered_map<std::string, size_t>& batch_var_ave_dim,
std::unordered_map<std::string, Node*>* tensor_nodes,
space_table_t* space_table) const {
// Collect tensors from graph.
for (auto* node : graph_->Nodes()) {
if (node->IsVar() &&
node->Var()->GetType() ==
framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) {
// Parameters will not be reused.
if (node->Var()->Persistable()) continue;
(*tensor_nodes)[node->Name()] = node;
(*space_table)[node->Name()] =
DataTypeToSpace(node->Var()->GetDataType()) *
batch_var_ave_dim.at(node->Name());
}
}
}
// Find a suitable existing tensor to reuse (big enough, but as small as
// possible to avoid wasting memory).
//
// Args:
// @tensor_nodes: the tensor nodes in the ir::Graph.
// @free_existing_tensors: the tensors that are already allocated and currently free.
// @space_table: the memory space of the tensors.
// @tensor2use: the tensor that requires memory.
//
// Returns:
// true if some existing tensor can be reused.
// false if there is no suitable tensor to reuse and a new tensor needs to be
// allocated for this requirement.
// The suitable tensor for reuse is one whose size is approximately equal to
// the memory demand.
bool FindSuitableTensorToReuse(
const std::string& tensor, int space_required,
const std::unordered_map<std::string, Node*>& tensor_nodes,
std::unordered_set<std::string>* free_existing_tensors,
const space_table_t& space_table,
const std::vector<std::unordered_set<std::string>>& var_clusters,
std::string* tensor2use) __SHOULD_USE_RESULT__;
bool FindSuitableTensorToReuse(
const std::string& tensor, int space_required,
const std::unordered_map<std::string, Node*>& tensor_nodes,
std::unordered_set<std::string>* free_existing_tensors,
const space_table_t& space_table,
const std::vector<std::unordered_set<std::string>>& var_clusters,
std::string* tensor2use) {
std::pair<std::string, size_t> best_fit;
best_fit.second = std::numeric_limits<int>::max();
VLOG(5) << "Split Tensors to " << var_clusters.size() << " clusters";
// find the cluster this var belongs to.
const std::unordered_set<std::string>* cluster = nullptr;
for (const auto& c : var_clusters) {
if (c.count(tensor)) {
cluster = &c;
break;
}
}
PADDLE_ENFORCE_NOT_NULL(cluster,
"something wrong in memory optimization, the "
"variable %s not in the clusters.",
tensor);
for (auto& candidate : *free_existing_tensors) {
// This is not a temporary tensor.
if (!space_table.count(candidate)) continue;
// Not in the same cluster.
if (!cluster->count(candidate)) continue;
size_t space = space_table.at(candidate);
size_t space_diff = std::abs<size_t>(space - space_required);
if (space_diff < best_fit.second) {
best_fit.first = candidate;
best_fit.second = space_diff;
}
}
if (best_fit.second < std::numeric_limits<int>::max()) {
*tensor2use = best_fit.first;
return true;
}
return false;
}
// Allocate a new tensor instead of reusing an existing one.
void AllocateNewTensor(
const std::string& name, size_t space_required,
const std::unordered_map<std::string, Node*>& tensor_nodes,
std::unordered_set<std::string>* free_existing_tensors,
space_table_t* space_table,
std::unordered_map<std::string, std::string>* reuse_table) {
// The newly born tensor is free to be used.
free_existing_tensors->insert(name);
// Register the space it has.
PADDLE_ENFORCE(space_table->count(name));
space_table->at(name) = std::max(space_table->at(name), space_required);
// The newly allocated tensor uses its own memory.
(*reuse_table)[name] = name;
}
// Free a tensor and make it reusable.
// @tensor: the tensor to free.
// @free_existing_tensors: the tensors that are allocated and currently free.
// @reuse_table: a map from a fake tensor to the existing allocated tensor.
void FreeATensor(const std::string& tensor,
std::unordered_set<std::string>* free_existing_tensors,
std::unordered_map<std::string, std::string>* reuse_table) {
if (tensor == "feed" || tensor == "fetch") return;
// the actually allocated tensor.
const auto& free_tensor = reuse_table->at(tensor);
free_existing_tensors->insert(free_tensor);
}
// Reuse a free existing tensor.
void ReuseATensor(const std::string& tensor, const std::string& tensor2reuse,
size_t memory_size,
std::unordered_set<std::string>* free_existing_tensors,
std::unordered_map<std::string, std::string>* reuse_table,
space_table_t* reused_space_table) {
auto it = free_existing_tensors->find(tensor2reuse);
PADDLE_ENFORCE(it != free_existing_tensors->end());
free_existing_tensors->erase(it);
(*reuse_table)[tensor] = tensor2reuse;
// Update the memory size of a reused tensor, the memory will grow if the
// required memory is larger.
(*reused_space_table)[tensor2reuse] =
std::max(reused_space_table->at(tensor2reuse), memory_size);
}
// Calculate the memory usage.
void EvaluateMemoryUsage(
const std::unordered_map<std::string, std::string>& reuse_table,
const space_table_t& space_table,
const std::unordered_map<std::string, size_t>& var_batch_ave_size,
size_t* allocated, size_t* saved) {
*allocated = 0;
*saved = 0;
for (auto elem : reuse_table) {
if (elem.first == elem.second) {
*allocated += space_table.at(elem.first);
VLOG(4) << elem.first << " <-> " << elem.second << " "
<< space_table.at(elem.first) << " "
<< space_table.at(elem.second);
} else {
*saved += space_table.at(elem.first);
VLOG(4) << "reuse " << elem.first << " -> " << elem.second;
}
}
VLOG(4) << "allocated " << *allocated;
VLOG(4) << "saved " << *saved;
}
// Return saved ratio.
void MemoryOptimizePass::MakeReusePlan(
const std::vector<std::unordered_set<std::string>>& var_clusters,
const std::unordered_map<std::string, size_t>& var_batch_ave_size,
const space_table_t& space_table,
std::unordered_map<std::string, std::string>* reuse_table, int sort_kind,
MemoryAllocation* memory_allocation) const {
// Clear the existing plan.
reuse_table->clear();
// The `space_table` stores the real memory size for each tensor.
// The `reused_space_table` stores the maximum memory size required by a
// tensor during the memory reusing, the small tensor might be reused by a
// larger tensor, and the memory size of the small one will grow.
auto reused_space_table = space_table;
std::unordered_map<std::string, lifecycle_t> life_cycles;
std::unordered_map<std::string, Node*> tensor_nodes;
// The allocated tensors whose memory can be reused; they will live across the
// program execution.
std::unordered_set<std::string> existing_tensors;
// The existing tensor that has been allocated, and is also free to reuse.
std::unordered_set<std::string> free_existing_tensors;
CollectLifeCycle(&life_cycles, sort_kind);
for (int age = 0; age < max_lifecycle_; ++age) {
std::unordered_set<std::string> born_tensors;
std::unordered_set<std::string> dead_tensors;
// Gather the dead and born tensors.
for (auto elem_it = life_cycles.begin(); elem_it != life_cycles.end();
elem_it++) {
if (elem_it->second.first == -1) {
continue;
}
const auto& tensor = elem_it->first;
const auto& lifecycle = elem_it->second;
VLOG(4) << "process " << tensor << " reuse " << lifecycle.first << "->"
<< lifecycle.second;
// Collect newly born tensors.
if (lifecycle.first == age) {
born_tensors.insert(tensor);
}
// Collect dead tensors whose memory can be reused.
else if (lifecycle.second < age) { // NOLINT
dead_tensors.insert(tensor);
// remove to avoid duplicate process.
elem_it->second.first = -1; // avoid duplicate search
}
}
// Reuse the dead tensors for born tensors
for (const auto& tensor : born_tensors) {
// Skip the feed and fetch tensors because they share data with others.
std::string tensor2reuse;
if (!space_table.count(tensor)) continue;
size_t space_required = space_table.at(tensor);
if (FindSuitableTensorToReuse(tensor, space_required, tensor_nodes,
&free_existing_tensors, reused_space_table,
var_clusters, &tensor2reuse)) {
if (tensor != tensor2reuse) {
VLOG(4) << tensor << " -> " << tensor2reuse;
}
ReuseATensor(tensor, tensor2reuse, space_required,
&free_existing_tensors, reuse_table, &reused_space_table);
} else {
VLOG(4) << "allocate " << tensor;
AllocateNewTensor(tensor, space_required, tensor_nodes,
&free_existing_tensors, &reused_space_table,
reuse_table);
ReuseATensor(tensor, tensor, space_required, &free_existing_tensors,
reuse_table, &reused_space_table);
}
}
for (const auto& tensor : dead_tensors) {
// free its memory.
FreeATensor(tensor, &free_existing_tensors, reuse_table);
}
}
EvaluateMemoryUsage(*reuse_table, reused_space_table, var_batch_ave_size,
&(memory_allocation->allocated),
&(memory_allocation->saved));
memory_allocation->sort_kind = sort_kind;
}
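As a small illustration of the plan above: if tensor A has lifecycle (0, 2) and tensor B, belonging to the same cluster and of similar size, has lifecycle (4, 6), then at age 4 A is already dead and sits in the free set, so B is mapped to A in the reuse table and A's entry in the reused space table grows to max(size_A, size_B); a tensor for which no suitable free candidate exists is allocated anew and mapped to itself.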
void BuildVarNodeTable(Graph* graph,
std::unordered_map<std::string, Node*>* var_node_table) {
for (auto* node : graph->Nodes()) {
if (node->IsVar()) {
(*var_node_table)[node->Name()] = node;
}
}
}
// NOTE The optimized opdesc doesn't match ir::Graph.
void UpdateOpDescsByReuse(
Graph* graph,
const std::unordered_map<std::string, std::string>& reuse_table,
int sort_kind) {
// TODO(Superjomn) change here to be compatible with the runtime order.
for (auto* node : TopologyVarientSort(
*graph, static_cast<framework::ir::SortKind>(sort_kind))) {
if (node->IsOp()) {
// Replace the original inputs/outputs with the reused tensors.
std::unordered_map<std::string, std::vector<std::string>> in_args,
out_args;
for (auto argument : node->Op()->Inputs()) {
for (const auto& x : argument.second) {
auto name = x;
if (reuse_table.count(x) && reuse_table.at(x) != x) {
name = reuse_table.at(x);
}
in_args[argument.first].push_back(name);
VLOG(4) << node->Name() << " input " << x << " -> " << name;
}
}
for (auto argument : node->Op()->Outputs()) {
for (const auto& x : argument.second) {
auto name = x;
if (reuse_table.count(x) && reuse_table.at(x) != x) {
name = reuse_table.at(x);
}
out_args[argument.first].push_back(name);
VLOG(4) << node->Name() << " output " << x << " -> " << name;
}
}
// Update arguments.
for (auto& arg : in_args) {
node->Op()->SetInput(arg.first, arg.second);
}
for (auto& arg : out_args) {
node->Op()->SetOutput(arg.first, arg.second);
}
node->Op()->Flush();
}
}
}
void MemoryOptimizePass::PerformReusePlan(
const std::unordered_map<std::string, std::string>& reuse_table,
int sort_kind, std::unordered_set<std::string>* vars2remove) const {
std::unordered_map<std::string, Node*> var_node_table;
BuildVarNodeTable(graph_, &var_node_table);
UpdateOpDescsByReuse(graph_, reuse_table, sort_kind);
for (auto& item : reuse_table) {
if (item.first != item.second) {
vars2remove->insert(item.first);
}
}
VLOG(2) << "to remove vars " << vars2remove->size();
}
std::vector<std::string> split(const std::string& line, char delim) {
std::vector<std::string> res;
std::string field;
std::stringstream line_stream(line);
while (std::getline(line_stream, field, delim)) {
res.emplace_back(field);
}
return res;
}
// Deserialize the batch var shapes from the cache file.
std::vector<std::map<std::string, std::vector<int>>> DeseralizeBatchVarShapes(
const std::string& path) {
std::ifstream file(path);
PADDLE_ENFORCE(file.is_open(), "failed to open %s to read cache", path);
std::string line;
std::vector<std::map<std::string, std::vector<int>>> batch_shapes;
while (std::getline(file, line)) {
std::map<std::string, std::vector<int>> batch;
for (const auto& var_info : split(line, ';')) {
auto fields = split(var_info, ':');
PADDLE_ENFORCE_EQ(fields.size(), 2UL);
auto var_name = fields.front();
auto shape_str = split(fields[1], ',');
std::vector<int> shape;
for (const auto& v : shape_str) shape.push_back(std::stoi(v));
batch[var_name] = shape;
}
batch_shapes.push_back(batch);
}
return batch_shapes;
}
// Calculate the average dim of each tensor from the batch shape cache.
std::unordered_map<std::string, size_t> GetBatchAverageSize(
const std::vector<std::map<std::string, std::vector<int>>>& batches) {
std::unordered_map<std::string, size_t> var2size;
// The average size of the batches for each variable.
int num_batch = 0;
for (const auto& batch : batches) {
num_batch++;
for (const auto& item : batch) {
int dim = std::accumulate(item.second.begin(), item.second.end(), 1,
[](int a, int b) { return a * b; });
var2size[item.first] += dim;
}
}
for (auto& item : var2size) {
item.second /= num_batch;
}
return var2size;
}
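For instance, if the cache holds two batches in which a variable x has shapes [1, 128] and [3, 128], the per-batch element counts are 128 and 384, so the average size recorded for x is (128 + 384) / 2 = 256 (integer division over the number of batches).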
// Analyze the batch shapes loaded from the cache file.
// By splitting the variables into different clusters according to their batch
// size, we can pre-schedule how the LoDTensors change when input sequences of
// different lengths are entered.
// This should work fine for models operating on sentences.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesByBatchSize(
const std::vector<std::map<std::string, std::vector<int>>>& batches) {
// Collect the batch size of each shape and combine them into a stringstream,
// which is convenient for generating a hash.
std::unordered_map<std::string, std::stringstream> var_batchsize_hashes;
for (auto& batch : batches) {
for (auto& ele : batch) {
int batch_size = ele.second.front();
// TODO(Superjomn) might consume large memory here, use combine hash.
var_batchsize_hashes[ele.first] << batch_size;
}
}
// Split to sets by batch size sequences.
std::unordered_map<size_t /*hash*/, std::unordered_set<std::string>>
shape_sets;
for (auto& ele : var_batchsize_hashes) {
auto hash = std::hash<std::string>()(ele.second.str());
shape_sets[hash].insert(ele.first);
}
std::vector<std::unordered_set<std::string>> res;
for (auto& ele : shape_sets) {
res.emplace_back(std::move(ele.second));
}
VLOG(3) << "Cluster by batch_size and get " << res.size() << " clusters";
return res;
}
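For example, if across three warm-up batches variable a sees batch sizes 1, 4, 2, variable b also sees 1, 4, 2, and variable c sees 1, 1, 1, then a and b hash the same batch-size sequence and fall into one cluster while c falls into another.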
// Analyze the batch shapes loaded from the cache file, and split them into
// different clusters by their size.
// This should work fine for models in general.
std::vector<std::unordered_set<std::string>> AnalysisBatchShapesBySimilarSize(
const space_table_t& space_table,
const std::vector<std::map<std::string, std::vector<int>>>& batches,
int interval = 200000) {
PADDLE_ENFORCE_GT(interval, 0);
// Cluster the tensors into different groups.
size_t max_size = 0;
for (auto& item : space_table) {
max_size = std::max(item.second, max_size);
}
VLOG(4) << "tensor max size " << max_size;
std::vector<std::unordered_set<std::string>> res;
// cluster by intervals.
for (size_t interval_size = 0; interval_size <= max_size;
interval_size += interval) {
std::unordered_set<std::string> cluster;
for (auto& item : space_table) {
if (interval_size <= item.second &&
interval_size + interval > item.second) {
cluster.insert(item.first);
}
}
if (!cluster.empty()) {
res.push_back(cluster);
}
}
VLOG(3) << "Cluster by interval and get " << res.size() << " cluster";
return res;
}
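For example, with interval = 1024, tensors whose average sizes are 500, 900, and 1500 bytes form two clusters: {500, 900} in the [0, 1024) bucket and {1500} in the [1024, 2048) bucket.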
std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; }
void MemoryOptimizePass::RunImpl(Argument* argument) {
// When a forced cache update is requested, do not optimize memory.
if (!argument->enable_memory_optim() || argument->memory_optim_force_update())
return;
graph_ = argument->main_graph_ptr();
auto path = GetMemoryCachePath(
argument->model_dir_valid() ? argument->model_dir() : "",
argument->model_program_path_valid() ? argument->model_program_path()
: "");
VLOG(3) << "Load memory cache from " << path;
if (inference::IsFileExists(path)) {
VLOG(4) << "Performing memory optimize";
auto batches = DeseralizeBatchVarShapes(path);
auto var_batch_ave_size = GetBatchAverageSize(batches);
std::unordered_map<std::string, Node*> tensor_nodes;
space_table_t space_table;
CollectVarMemorySize(var_batch_ave_size, &tensor_nodes, &space_table);
std::unordered_map<std::string, std::string> reuse_table;
double max_saving_ratio = 0.;
std::vector<std::function<MemoryAllocation()>> strategies;
for (int sort_kind = 0; sort_kind < 2; sort_kind++) {
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_batch_size =
AnalysisBatchShapesByBatchSize(batches);
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_batch_size, var_batch_ave_size,
space_table, &reuse_table, sort_kind, &allocation);
return allocation;
});
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
space_table, batches, 1024); // interval 1kb
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
space_table, &reuse_table, sort_kind, &allocation);
return allocation;
});
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
space_table, batches, 1024 * 1024); // interval 1MB
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
space_table, &reuse_table, sort_kind, &allocation);
return allocation;
});
strategies.emplace_back([&, sort_kind] {
auto clustered_vars_by_ave_size = AnalysisBatchShapesBySimilarSize(
space_table, batches,
std::numeric_limits<int>::max()); // no intervals
MemoryAllocation allocation;
MakeReusePlan(clustered_vars_by_ave_size, var_batch_ave_size,
space_table, &reuse_table, sort_kind, &allocation);
return allocation;
});
}
std::function<MemoryAllocation()>* best_strategy{nullptr};
// Try all strategies to get the best result.
for (auto& strategy : strategies) {
auto allocation = strategy();
string::PrettyLogDetail("--- get strategy saving %f memory for workspace",
allocation.GetSavingRatio());
if (allocation.GetSavingRatio() > max_saving_ratio) {
max_saving_ratio = allocation.GetSavingRatio();
best_strategy = &strategy;
}
}
if (!best_strategy) {
LOG(ERROR)
<< "This model makes poor memory optimize, skip memory optimize";
return;
}
auto memory_allocation = (*best_strategy)();
string::PrettyLogH2(
"--- Saved %.2f%s memory for workspace(temporary variables)",
memory_allocation.GetSavingRatio() * 100, "%");
string::PrettyLogDetail("--- Allocated %d MB",
memory_allocation.allocated / 1024. / 1024.);
string::PrettyLogDetail("--- Saved %d MB",
memory_allocation.saved / 1024. / 1024.);
argument->main_graph().Set(framework::ir::kGraphToProgramVarsToRemove,
new std::unordered_set<std::string>);
auto& vars2remove =
argument->main_graph().Get<std::unordered_set<std::string>>(
framework::ir::kGraphToProgramVarsToRemove);
PerformReusePlan(reuse_table, memory_allocation.sort_kind, &vars2remove);
argument->SetMemoryOptimSortKind(memory_allocation.sort_kind);
}
}
float MemoryOptimizePass::MemoryAllocation::GetSavingRatio() const {
return (saved / 1024.) / (allocated / 1024. + saved / 1024.);
}
} // namespace analysis
} // namespace inference
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/inference/analysis/analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
/*
* Memory optimization pass for inference with pre-analysis of memory usage
* without GC.
* Different from training, the inference memory reuse strategy does not
* include GC, because the GC overhead is too high when the batch size is one.
*
* The inference memory reuse tries to pre-determine the tensor reuse strategy
* without runtime overhead.
*
* To improve the strategy's performance, a warm-up run is introduced:
* - Before officially deploying the inference program, one should warm it up
* and generate some runtime cache:
* - Run the inference program with several batches of data; it will persist
* some runtime information about the memory of the tensors to disk, which we
* call the memory reusing cache.
* - With the memory reusing cache, the user can deploy the inference to a
* service. Before running the model, the inference program will load the
* memory cache, analyze it, generate the best memory reusing strategy, and
* adjust the execution of the network accordingly.
*
* With the warm-up and memory reusing cache design, the memory reusing
* algorithm can analyze the real memory consumption of the tensors, even with
* flexible LoDTensors and special shape-changing operators such as
* sequence-pooling.
*/
class MemoryOptimizePass : public AnalysisPass {
public:
using space_table_t = std::unordered_map<std::string, size_t>;
using lifecycle_t = std::pair<int, int>;
struct MemoryAllocation {
size_t allocated; // allocated memory in bytes.
size_t saved; // saved memory in bytes.
int sort_kind; // the kind of the corresponding sorting algorithm.
// Get the memory saving ratio of temporary variables.
float GetSavingRatio() const;
};
virtual ~MemoryOptimizePass() = default;
protected:
void RunImpl(Argument *argument) override;
private:
void CollectLifeCycle(
std::unordered_map<std::string, lifecycle_t> *lifecycles,
int sort_kind) const;
void CollectVarMemorySize(
const std::unordered_map<std::string, size_t> &batch_var_ave_dim,
std::unordered_map<std::string, framework::ir::Node *> *tensor_nodes,
space_table_t *space_table) const;
// Returns percentage of saved memory.
void MakeReusePlan(
const std::vector<std::unordered_set<std::string>> &var_clusters,
const std::unordered_map<std::string, size_t> &var_batch_ave_size,
const space_table_t &space_table,
std::unordered_map<std::string, std::string> *reuse_table, int sort_kind,
MemoryAllocation *memory_allocation) const;
void PerformReusePlan(
const std::unordered_map<std::string, std::string> &reuse_table,
int sort_kind, std::unordered_set<std::string> *vars2remove) const;
public:
std::string repr() const override;
private:
mutable framework::ir::Graph *graph_{nullptr};
mutable int max_lifecycle_{-1};
};
static std::string GetMemoryCachePath(const std::string &model_path,
const std::string &prog_path) {
auto path = model_path.empty() ? prog_path : model_path;
return path + ".memory_cache";
}
} // namespace analysis
} // namespace inference
} // namespace paddle
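To make the warm-up flow described in the header comment above concrete, a minimal client-side sketch might look as follows; the model path is a placeholder and PrepareWarmupBatch is a hypothetical helper, not part of this change:

paddle::contrib::AnalysisConfig config("/path/to/model_dir");  // placeholder
config.EnableMemoryOptim(/*force_update_cache=*/false);
auto predictor =
    paddle::CreatePaddlePredictor<paddle::contrib::AnalysisConfig>(config);

std::vector<paddle::PaddleTensor> inputs = PrepareWarmupBatch();  // hypothetical
std::vector<paddle::PaddleTensor> outputs;
predictor->Run(inputs, &outputs);

// Destroying the predictor serializes the collected shapes next to the model
// (see GetMemoryCachePath above); a later predictor created with
// EnableMemoryOptim() loads that cache and builds the reuse plan during analysis.
predictor.reset();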
......@@ -13,24 +13,31 @@
// limitations under the License.
#include "paddle/fluid/inference/analysis/passes/passes.h"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc"
#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h"
#include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
namespace paddle {
namespace inference {
namespace analysis {
PassRegistry::PassRegistry() {
// Register the passes manually here to avoid a trivial `USE_OP`-like macro,
// which makes them easier to use and link.
passes_.emplace("ir_analysis_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisPass));
passes_.emplace("ir_graph_build_pass",
std::unique_ptr<AnalysisPass>(new IrGraphBuildPass));
passes_.emplace("ir_analysis_compose_pass",
std::unique_ptr<AnalysisPass>(new IrAnalysisComposePass));
passes_.emplace("memory_optimize_pass",
std::unique_ptr<AnalysisPass>(new MemoryOptimizePass));
passes_.emplace(
"ir_params_sync_among_devices_pass",
std::unique_ptr<AnalysisPass>(new IrParamsSyncAmongDevicesPass));
passes_.emplace(
"ir_graph_to_program_pass",
std::unique_ptr<IrGraphToProgramPass>(new IrGraphToProgramPass));
}
} // namespace analysis
......
......@@ -18,8 +18,10 @@ if(APPLE)
endif(APPLE)
set(inference_deps paddle_inference_api paddle_fluid_api analysis pass
ir_pass_manager naive_executor analysis_predictor ${GLOB_PASS_LIB})
set(inference_deps ${analysis_deps}
paddle_inference_api paddle_fluid_api
analysis pass naive_executor
${GLOB_PASS_LIB})
if(WITH_GPU AND TENSORRT_FOUND)
set(inference_deps ${inference_deps} tensorrt_engine tensorrt_converter)
......@@ -29,7 +31,8 @@ add_subdirectory(details)
cc_library(analysis_config SRCS analysis_config.cc DEPS lod_tensor paddle_pass_builder)
cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api analysis naive_executor zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager)
cc_library(analysis_predictor SRCS analysis_predictor.cc DEPS paddle_inference_api zero_copy_tensor
reset_tensor_array analysis_config paddle_pass_builder ir_pass_manager ${inference_deps})
cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS
lod_tensor scope paddle_pass_builder reset_tensor_array analysis_config
analysis_config paddle_pass_builder zero_copy_tensor
......@@ -44,7 +47,7 @@ if(WITH_TESTING)
ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book)
set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification)
endif()
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps}
cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps}
ARGS --dirname=${WORD2VEC_MODEL_DIR})
if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
......
......@@ -44,16 +44,22 @@ PassStrategy *contrib::AnalysisConfig::pass_builder() const {
contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) {
model_dir_ = model_dir;
Update();
}
contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file,
const std::string &params_file) {
prog_file_ = prog_file;
params_file_ = params_file;
Update();
}
void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path,
const std::string &params_file_path) {
prog_file_ = prog_file_path;
params_file_ = params_file_path;
Update();
}
void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
int device_id) {
......@@ -62,11 +68,17 @@ void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb,
memory_pool_init_size_mb_ = memory_pool_init_size_mb;
device_id_ = device_id;
#else
LOG(ERROR) << "Please compile with gpu to EnableGpu";
LOG(ERROR) << "Please compile with gpu to EnableGpu()";
use_gpu_ = false;
#endif
Update();
}
void contrib::AnalysisConfig::DisableGpu() {
use_gpu_ = false;
Update();
}
void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; }
contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
#define CP_MEMBER(member__) member__ = other.member__;
......@@ -81,6 +93,9 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
CP_MEMBER(enable_memory_optim_);
CP_MEMBER(memory_optim_force_update_);
// TensorRT releated.
CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_);
......@@ -109,6 +124,8 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) {
}
#undef CP_MEMBER
Update();
}
void contrib::AnalysisConfig::EnableMKLDNN() {
......@@ -119,33 +136,64 @@ void contrib::AnalysisConfig::EnableMKLDNN() {
LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
use_mkldnn_ = false;
#endif
Update();
}
void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size,
int max_batch_size,
int min_subgraph_size) {
#ifdef PADDLE_WITH_CUDA
if (!use_gpu()) {
LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
return;
}
use_tensorrt_ = true;
tensorrt_workspace_size_ = workspace_size;
tensorrt_max_batchsize_ = max_batch_size;
tensorrt_min_subgraph_size_ = min_subgraph_size;
Update();
#else
LOG(ERROR)
<< "To use TensorRT engine, please compile inference lib with GPU first.";
#endif
}
// TODO(Superjomn) refactor this, buggy.
void contrib::AnalysisConfig::Update() {
auto info = SerializeInfoCache();
if (info == serialized_info_cache_) return;
if (use_gpu_) {
pass_builder_.reset(new GpuPassStrategy);
// Transfer pass_builder and copy the existing compatible passes.
if (!pass_builder_ || ((use_gpu() ^ pass_builder_->use_gpu()))) {
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy);
if (use_tensorrt_) {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
} else {
pass_builder_.reset(new CpuPassStrategy);
}
} else {
pass_builder_.reset(new CpuPassStrategy);
if (use_gpu()) {
pass_builder_.reset(new GpuPassStrategy(
*static_cast<GpuPassStrategy *>(pass_builder_.get())));
} else {
pass_builder_.reset(new CpuPassStrategy(
*static_cast<CpuPassStrategy *>(pass_builder_.get())));
}
}
if (use_tensorrt_) {
if (!use_gpu_) {
LOG(ERROR)
<< "TensorRT engine is not available when EnableGpu() not actived.";
} else {
const auto &passes = pass_builder_->AllPasses();
if (std::find(passes.begin(), passes.end(), "tensorrt_subgraph_pass") ==
std::end(passes)) {
// Append after the Affine_channel_conv_fuse pass.
pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
}
......@@ -165,6 +213,10 @@ void contrib::AnalysisConfig::Update() {
#endif
}
if (enable_memory_optim_) {
pass_builder()->AppendAnalysisPass("memory_optimize_pass");
}
if (ir_debug_) {
pass_builder()->TurnOnDebug();
}
......@@ -172,24 +224,43 @@ void contrib::AnalysisConfig::Update() {
std::string contrib::AnalysisConfig::SerializeInfoCache() {
std::stringstream ss;
ss << model_dir_;
ss << prog_file_;
ss << params_file_;
ss << use_gpu_;
ss << device_id_;
ss << memory_pool_init_size_mb_;
ss << use_tensorrt_;
ss << tensorrt_workspace_size_;
ss << tensorrt_max_batchsize_;
ss << tensorrt_min_subgraph_size_;
ss << enable_memory_optim_;
ss << memory_optim_force_update_;
ss << use_mkldnn_;
for (auto &item : mkldnn_enabled_op_types_) ss << item;
ss << ";";
ss << model_from_memory_;
ss << enable_ir_optim_;
ss << use_feed_fetch_ops_;
ss << ir_debug_;
ss << specify_input_name_;
ss << cpu_math_library_num_threads_;
return ss.str();
}
void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads(
int cpu_math_library_num_threads) {
cpu_math_library_num_threads_ = cpu_math_library_num_threads;
Update();
}
float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
......@@ -207,6 +278,17 @@ float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}
void contrib::AnalysisConfig::EnableMemoryOptim(bool force_update_cache) {
enable_memory_optim_ = true;
memory_optim_force_update_ = force_update_cache;
Update();
}
bool contrib::AnalysisConfig::enable_memory_optim() const {
return enable_memory_optim_;
}
void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
size_t prog_buffer_size,
const char *param_buffer,
......@@ -214,6 +296,8 @@ void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer,
prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size);
params_file_ = std::string(param_buffer, param_buffer + param_buffer_size);
model_from_memory_ = true;
Update();
}
} // namespace paddle
......@@ -24,18 +24,21 @@
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#if PADDLE_WITH_TENSORRT
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#endif
DECLARE_bool(profile);
namespace paddle {
......@@ -189,6 +192,12 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
LOG(ERROR) << "fail to get fetches";
return false;
}
// Collect variable shapes for memory optimization.
if (need_collect_var_shapes_for_memory_optim()) {
CollectVarShapes();
}
VLOG(3) << "predict cost: " << timer.toc() << "ms";
// All the containers in the scope will be held in inference, but the
......@@ -317,6 +326,8 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
argument_.SetUseGPU(config_.use_gpu());
argument_.SetGPUDeviceId(config_.gpu_device_id());
argument_.SetEnableMemoryOptim(config_.enable_memory_optim());
argument_.SetMemoryOptimForceUpdate(config_.memory_optim_force_update_);
argument_.SetModelFromMemory(config_.model_from_memory_);
// Analyze inference_program
if (!config_.model_dir().empty()) {
......@@ -331,6 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if (config_.use_gpu() && config_.tensorrt_engine_enabled()) {
LOG(INFO) << "TensorRT subgraph engine is enabled";
argument_.SetUseTensorRT(true);
argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_);
argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
......@@ -338,12 +350,17 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
}
if (config_.use_mkldnn_) {
LOG(INFO) << "MKLDNN is enabled";
argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_);
}
auto passes = config_.pass_builder()->AllPasses();
if (!config_.ir_optim()) passes.clear();
if (!config_.ir_optim()) {
passes.clear();
LOG(INFO) << "ir_optim is turned off, no IR pass will be executed";
}
argument_.SetIrAnalysisPasses(passes);
argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses());
argument_.SetScopeNotOwned(const_cast<framework::Scope *>(scope_.get()));
Analyzer().Run(&argument_);
......@@ -558,6 +575,13 @@ AnalysisPredictor::~AnalysisPredictor() {
if (sub_scope_) {
scope_->DeleteScope(sub_scope_);
}
// TODO(Superjomn) deduce the directory path.
std::string out_path = inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file());
if (need_collect_var_shapes_for_memory_optim()) {
SerializeBatchVarShapes(out_path);
}
}
std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
......@@ -567,6 +591,66 @@ std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
return std::unique_ptr<PaddlePredictor>(x);
}
void AnalysisPredictor::CollectVarShapes() {
VLOG(4) << "Collecting var shapes";
if (batch_var_shapes_.size() >= max_shape_collect_count_) return;
std::map<std::string, std::vector<int>> var_shapes;
for (auto var_name : inference_program_->Block(0).LocalVarNames()) {
auto *var = sub_scope_->FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var);
if (var->Type() == framework::VarTypeTrait<framework::LoDTensor>::kId ||
var->Type() == framework::VarTypeTrait<framework::Tensor>::kId) {
auto &tensor = var->Get<framework::LoDTensor>();
auto shape = framework::vectorize(tensor.dims());
var_shapes[var_name].assign(shape.begin(), shape.end());
}
}
batch_var_shapes_.push_back(var_shapes);
LOG_FIRST_N(INFO, 1) << "Collected " << batch_var_shapes_.size()
<< " batch of var shapes for analysis";
}
void AnalysisPredictor::SerializeBatchVarShapes(const std::string &path) {
LOG(INFO) << "serialize batch var shapes to " << path;
std::ofstream file(path);
if (!file.is_open()) {
LOG(ERROR) << "failed to serialize the var shapes to " << path;
return;
}
// The serialized data format:
// <tensor_name>:dim0,dim1,...,dimN;
for (auto &batch : batch_var_shapes_) {
for (auto &ele : batch) {
file << ele.first << ":";
for (size_t i = 0; i < ele.second.size() - 1; i++) {
file << ele.second[i] << ",";
}
file << ele.second.back() << ";";
}
file << "\n";
}
}
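
For example, a batch holding two variables with shapes {4, 1} and {4, 128} serializes to one line such as "fc_0.tmp_0:4,1;emb_0.tmp_0:4,128;" (the variable names here are made up). A small self-contained sketch of a reader for this format, assuming the cache consumer parses it line by line (the real reader presumably lives in the memory-optimize pass and may differ):

#include <map>
#include <sstream>
#include <string>
#include <vector>

// Parse one serialized batch line of the form
//   <name>:d0,d1,...,dN;<name>:d0,...;
// into a name -> shape table. Illustrative sketch only.
std::map<std::string, std::vector<int>> ParseBatchShapeLine(
    const std::string &line) {
  std::map<std::string, std::vector<int>> shapes;
  std::stringstream records(line);
  std::string record;
  while (std::getline(records, record, ';')) {
    if (record.empty()) continue;
    auto colon = record.find(':');
    if (colon == std::string::npos) continue;
    const std::string name = record.substr(0, colon);
    std::stringstream dims(record.substr(colon + 1));
    std::string dim;
    while (std::getline(dims, dim, ',')) {
      if (!dim.empty()) shapes[name].push_back(std::stoi(dim));
    }
  }
  return shapes;
}
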
bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
if (need_collect_var_shapes_ >= 0) return need_collect_var_shapes_;
bool need = false;
// check if the cache exists
if (!config_.enable_memory_optim()) {
need = false;
} else if (config_.enable_memory_optim() &&
!inference::IsFileExists(inference::analysis::GetMemoryCachePath(
config_.model_dir(), config_.prog_file()))) {
need = true;
} else if (config_.enable_memory_optim() &&
config_.memory_optim_force_update_) {
need = true;
}
need_collect_var_shapes_ = need ? 1 : 0;
return need;
}
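
Putting the pieces together: shapes are collected only when memory optimization is enabled and either no shape cache exists yet or a forced refresh was requested. A hedged sketch of the intended two-phase workflow (the model path is a placeholder and the warm-up policy is up to the caller):

// Phase 1: warm up and (re)write the shape cache.
contrib::AnalysisConfig warmup_cfg("/path/to/model");   // placeholder path
warmup_cfg.EnableMemoryOptim(true /*force_update_cache*/);
auto warmup_predictor =
    CreatePaddlePredictor<contrib::AnalysisConfig>(warmup_cfg);
// ... run a few representative batches; the collected shapes are serialized
// when the predictor is destroyed.

// Phase 2: create the serving predictor, which reuses the cached shapes.
contrib::AnalysisConfig serve_cfg("/path/to/model");
serve_cfg.EnableMemoryOptim();
auto predictor = CreatePaddlePredictor<contrib::AnalysisConfig>(serve_cfg);
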
template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<contrib::AnalysisConfig>(
const contrib::AnalysisConfig &config) {
......
......@@ -75,6 +75,11 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid);
protected:
// For memory optimization.
bool need_collect_var_shapes_for_memory_optim();
void CollectVarShapes();
void SerializeBatchVarShapes(const std::string &path);
bool PrepareProgram(const std::shared_ptr<framework::ProgramDesc> &program);
bool PrepareScope(const std::shared_ptr<framework::Scope> &parent_scope);
bool CreateExecutor();
......@@ -118,6 +123,11 @@ class AnalysisPredictor : public PaddlePredictor {
// A mutex help to make Clone thread safe.
std::mutex clone_mutex_;
// For memory optimization.
const size_t max_shape_collect_count_{1000};
int need_collect_var_shapes_{-1}; // -1 for default, 0 for false, 1 for true.
std::vector<std::map<std::string, std::vector<int>>> batch_var_shapes_;
private:
// Some status here that help to determine the status inside the predictor.
bool status_program_optimized_{false};
......
......@@ -16,8 +16,10 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <thread> // NOLINT
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_string(dirname, "", "dirname to tests.");
......@@ -191,4 +193,53 @@ TEST(AnalysisPredictor, Clone) {
}
}
TEST(AnalysisPredictor, memory_optim) {
AnalysisConfig config(FLAGS_dirname);
config.DisableGpu();
config.EnableMemoryOptim(true);
config.pass_builder()->TurnOnDebug();
auto native_predictor =
CreatePaddlePredictor<NativeConfig>(config.ToNativeConfig());
// 2. Dummy Input Data
int64_t data[4] = {1, 2, 3, 4};
PaddleTensor tensor;
tensor.shape = std::vector<int>({4, 1});
tensor.data.Reset(data, sizeof(data));
tensor.dtype = PaddleDType::INT64;
std::vector<PaddleTensor> inputs(4, tensor);
std::vector<PaddleTensor> output, output1;
{
// The first predictor helps to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
// Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) {
ASSERT_TRUE(predictor->Run(inputs, &output));
}
}
{
output.clear();
// The second predictor performs the memory optimization.
config.EnableMemoryOptim(false);
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
// Run with memory optimization
ASSERT_TRUE(predictor->Run(inputs, &output));
}
// Run native
ASSERT_TRUE(native_predictor->Run(inputs, &output1));
LOG(INFO) << "the output " << inference::DescribeTensor(output.front());
LOG(INFO) << "the native output "
<< inference::DescribeTensor(output1.front());
inference::CompareResult(output, output1);
}
} // namespace paddle
#!/bin/bash
set -x
PADDLE_ROOT=$1
TURN_ON_MKL=$2 # use MKL or Openblas
......
......@@ -15,7 +15,10 @@
#pragma once
#include <glog/logging.h>
#include <fstream>
#if !defined(_WIN32)
#include <sys/time.h>
#endif
#include <algorithm>
#include <chrono> // NOLINT
#include <iterator>
......@@ -182,7 +185,8 @@ static bool CompareTensor(const PaddleTensor &a, const PaddleTensor &b) {
return true;
}
static std::string DescribeTensor(const PaddleTensor &tensor) {
static std::string DescribeTensor(const PaddleTensor &tensor,
int max_num_of_data = 15) {
std::stringstream os;
os << "Tensor [" << tensor.name << "]\n";
os << " - type: ";
......@@ -253,5 +257,12 @@ static void PrintTime(int batch_size, int repeat, int num_threads, int tid,
}
}
static bool IsFileExists(const std::string &path) {
std::ifstream file(path);
bool exists = file.is_open();
file.close();
return exists;
}
} // namespace inference
} // namespace paddle
......@@ -192,6 +192,13 @@ struct AnalysisConfig {
*/
bool model_from_memory() const { return model_from_memory_; }
/** Turn on memory optimization.
 * NOTE still in development, will be released later.
 */
void EnableMemoryOptim(bool force_update_cache = false);
/** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const;
friend class ::paddle::AnalysisPredictor;
/** NOTE just for developer, not an official API, easily to be broken.
......@@ -232,6 +239,10 @@ struct AnalysisConfig {
// subgraph, 3 as default value.
int tensorrt_min_subgraph_size_{3};
// memory reuse related.
bool enable_memory_optim_{false};
bool memory_optim_force_update_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include <glog/logging.h>
namespace paddle {
......@@ -65,4 +66,8 @@ void GpuPassStrategy::EnableMKLDNN() {
LOG(ERROR) << "GPU not support MKLDNN yet";
}
void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) {
analysis_passes_.push_back(pass);
}
} // namespace paddle
......@@ -45,6 +45,9 @@ class PaddlePassBuilder {
/** Delete all the passes that have type `pass_type`. */
void DeletePass(const std::string &pass_type);
/** Append an analysis pass. */
void AppendAnalysisPass(const std::string &pass);
/** Visualize the computation graph after each pass by generating a DOT
 * language file; the generated files can be drawn with the Graphviz toolkit.
 */
......@@ -54,8 +57,18 @@ class PaddlePassBuilder {
std::string DebugString();
const std::vector<std::string> &AllPasses() const { return passes_; }
std::vector<std::string> AnalysisPasses() const {
auto passes = analysis_passes_;
// Make sure ir_graph_to_program is the last pass, so any
// modification of the IR will persist to the program.
passes.push_back("ir_graph_to_program_pass");
return passes;
}
protected:
std::vector<std::string> analysis_passes_{
{"ir_graph_build_pass", "ir_analysis_pass",
"ir_params_sync_among_devices_pass"}};
std::vector<std::string> passes_;
};
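
AppendAnalysisPass lets callers extend the analysis pipeline, while AnalysisPasses() always re-appends ir_graph_to_program_pass at the end so the optimized IR is written back into the program. A hedged sketch of registering the new memory-optimization pass this way (the pass name is assumed from the header added in this patch; the real wiring happens inside AnalysisConfig):

contrib::AnalysisConfig config("/path/to/model");    // placeholder path
config.pass_builder()->AppendAnalysisPass("memory_optimize_pass");  // assumed pass name
// ir_graph_to_program_pass still runs last, so any IR changes persist.
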
......@@ -69,7 +82,7 @@ class PassStrategy : public PaddlePassBuilder {
/** The MKLDNN control exists in both CPU and GPU mode, because some CPU
 * kernels may still be used even in GPU mode.
 */
virtual void EnableMKLDNN() = 0;
virtual void EnableMKLDNN() {}
bool use_gpu() const { return use_gpu_; }
......@@ -77,6 +90,7 @@ class PassStrategy : public PaddlePassBuilder {
protected:
bool use_gpu_{false};
bool use_mkldnn_{false};
};
/** The CPU passes controller, it is used in AnalysisPredictor with CPU mode.
......@@ -107,25 +121,31 @@ class CpuPassStrategy : public PassStrategy {
use_gpu_ = false;
}
explicit CpuPassStrategy(const CpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {}
virtual ~CpuPassStrategy() = default;
void EnableMKLDNN() override {
// TODO(Superjomn) Consider the way to mix CPU with GPU.
#ifdef PADDLE_WITH_MKLDNN
passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass :
std::vector<std::string>({"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv3d_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass"})) {
passes_.push_back(pass);
if (!use_mkldnn_) {
passes_.insert(passes_.begin(), "mkldnn_placement_pass");
for (auto &pass : std::vector<std::string>(
{"depthwise_conv_mkldnn_pass", //
"conv_bias_mkldnn_fuse_pass", //
"conv3d_bias_mkldnn_fuse_pass", //
"conv_relu_mkldnn_fuse_pass", //
"conv_elementwise_add_mkldnn_fuse_pass"})) {
passes_.push_back(pass);
}
}
use_mkldnn_ = true;
#else
use_mkldnn_ = false;
#endif
}
CpuPassStrategy(const CpuPassStrategy &other) : PassStrategy(other.passes_) {}
};
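
With the use_mkldnn_ flag tracked in the base class, repeated EnableMKLDNN() calls become idempotent; a minimal sketch:

CpuPassStrategy strategy;
strategy.EnableMKLDNN();
strategy.EnableMKLDNN();  // second call inserts no duplicate MKLDNN passes
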
/** The GPU passes strategy, it is used in AnalysisPredictor with GPU mode.
......@@ -150,7 +170,7 @@ class GpuPassStrategy : public PassStrategy {
use_gpu_ = true;
}
GpuPassStrategy(const GpuPassStrategy &other)
explicit GpuPassStrategy(const GpuPassStrategy &other)
: PassStrategy(other.AllPasses()) {
use_gpu_ = true;
}
......
......@@ -19,7 +19,7 @@ endfunction()
function(inference_analysis_api_test target install_dir filename)
inference_analysis_test(${target} SRCS ${filename}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} benchmark
ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt)
endfunction()
......@@ -62,7 +62,7 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2
# normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL)
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
# small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
......
......@@ -126,6 +126,7 @@ void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
std::string turn_mask_pre = "turn_mask_";
auto one_batch = data->NextBatch();
PADDLE_ENFORCE(!one_batch.response.empty());
int size = one_batch.response[0].size();
CHECK_EQ(size, kMaxTurnLen);
// turn tensor assignment
......@@ -200,6 +201,7 @@ void profile(bool use_mkldnn = false) {
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
......@@ -250,7 +252,35 @@ void compare(bool use_mkldnn = false) {
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}
// Compare result of NativeConfig and AnalysisConfig with memory optimization.
TEST(Analyzer_dam, compare_with_memory_optim) {
// The small DAM model core dumps in CI, but works locally.
if (FLAGS_max_turn_num == 9) {
contrib::AnalysisConfig cfg, cfg1;
DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
// Run the first time to force an update of the memory cache.
SetConfig(&cfg);
cfg.EnableMemoryOptim(true);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all);
// Run a second time to use the memory cache and perform the memory optimization.
SetConfig(&cfg1);
cfg1.EnableMemoryOptim();
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg1),
input_slots_all);
}
}
TEST(Analyzer_dam, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif
......
......@@ -69,6 +69,7 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
TEST(Analyzer_Text_Classification, profile) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.pass_builder()->TurnOnDebug();
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
......@@ -98,6 +99,7 @@ TEST(Analyzer_Text_Classification, profile) {
TEST(Analyzer_Text_Classification, compare) {
AnalysisConfig cfg;
SetConfig(&cfg);
cfg.EnableMemoryOptim();
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
......
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
......@@ -55,7 +56,7 @@ void SetConfig(AnalysisConfig *cfg) {
FLAGS_infer_model + "/__params__");
cfg->DisableGpu();
cfg->SwitchIrDebug();
cfg->SwitchSpecifyInputNames();
cfg->SwitchSpecifyInputNames(false);
// TODO(TJ): fix fusion gru
cfg->pass_builder()->DeletePass("fc_gru_fuse_pass");
}
......@@ -86,6 +87,7 @@ void profile(bool use_mkldnn = false) {
if (use_mkldnn) {
cfg.EnableMKLDNN();
}
// cfg.pass_builder()->TurnOnDebug();
std::vector<PaddleTensor> outputs;
std::vector<std::vector<PaddleTensor>> input_slots_all;
......@@ -103,9 +105,8 @@ void profile(bool use_mkldnn = false) {
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
CHECK_EQ(numel, refer.data.size());
for (size_t i = 0; i < numel; ++i) {
CHECK_LT(
fabs(static_cast<float *>(output.data.data())[i] - refer.data[i]),
1e-5);
EXPECT_NEAR(static_cast<float *>(output.data.data())[i], refer.data[i],
1e-5);
}
}
}
......
......@@ -15,6 +15,7 @@
#pragma once
#include <gtest/gtest.h>
#include <algorithm>
#include <string>
#include <thread> // NOLINT
......@@ -28,9 +29,8 @@
#include "paddle/fluid/inference/analysis/analyzer.h"
#include "paddle/fluid/inference/analysis/ut_helper.h"
#include "paddle/fluid/inference/api/analysis_predictor.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/tests/api/config_printer.h"
#include "paddle/fluid/inference/tests/test_helper.h"
#include "paddle/fluid/inference/utils/benchmark.h"
......@@ -91,7 +91,7 @@ void CompareResult(const std::vector<PaddleTensor> &outputs,
float *pdata = static_cast<float *>(out.data.data());
float *pdata_ref = static_cast<float *>(ref_out.data.data());
for (size_t j = 0; j < size; ++j) {
EXPECT_NEAR(pdata_ref[j], pdata[j], FLAGS_accuracy);
CHECK_LE(std::abs(pdata_ref[j] - pdata[j]), FLAGS_accuracy);
}
break;
}
......
......@@ -157,5 +157,10 @@ TEST(AnalysisPredictor, use_gpu) {
}
}
TEST(TensorRT_mobilenet, profile) {
std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
profile(model_dir, true, false);
}
} // namespace inference
} // namespace paddle
......@@ -11,8 +11,8 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma once
#include <fstream>
#include <iostream>
#include <string>
......
......@@ -16,7 +16,7 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
using namespace paddle::inference;
using namespace paddle::inference; // NOLINT
TEST(Benchmark, basic) {
Benchmark benchmark;
benchmark.SetName("key0");
......@@ -36,4 +36,4 @@ TEST(Benchmark, PersistToFile) {
benchmark.PersistToFile("1.log");
benchmark.PersistToFile("1.log");
benchmark.PersistToFile("1.log");
}
\ No newline at end of file
}
......@@ -50,6 +50,7 @@ class FeedOp : public framework::OperatorBase {
<< out_name;
auto &feed_list = feed_var->Get<framework::FeedFetchList>();
PADDLE_ENFORCE_LT(static_cast<size_t>(col), feed_list.size());
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
......
......@@ -66,5 +66,22 @@ static void PrettyLog(const std::string &style, const char *fmt,
std::cerr << style << Sprintf(fmt, args...) << reset();
}
template <typename... Args>
static void PrettyLogInfo(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::info(), fmt, args...);
}
template <typename... Args>
static void PrettyLogDetail(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::detail(), fmt, args...);
}
template <typename... Args>
static void PrettyLogH1(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::H1(), fmt, args...);
}
template <typename... Args>
static void PrettyLogH2(const char *fmt, const Args &... args) {
PrettyLogEndl(Style::H2(), fmt, args...);
}
} // namespace string
} // namespace paddle
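
These wrappers only fix the Style argument of PrettyLogEndl; formatting still goes through the printf-style Sprintf. A short usage sketch (the message text and value below are illustrative only):

int reused_vars = 12;  // placeholder value for illustration
paddle::string::PrettyLogH1("--- memory optimize ---");
paddle::string::PrettyLogDetail("%d variables clustered for reuse", reused_vars);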