Unverified commit b42ced8e, authored by Yan Chunwei, committed by GitHub

bugfix/tensorrt analysis fix subgraph trigger (#12266)

Parent: c5c17a14
@@ -22,8 +22,6 @@
 #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h"
 namespace paddle {
-namespace inference {
-namespace analysis {
 DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
             "Enable subgraph to TensorRT engine for acceleration");
@@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false,
 DEFINE_string(inference_analysis_graphviz_log_root, "./",
               "Graphviz debuger for data flow graphs.");
+namespace inference {
+namespace analysis {
 class DfgPassManagerImpl final : public DfgPassManager {
  public:
   DfgPassManagerImpl() {
......
@@ -45,14 +45,15 @@ limitations under the License. */
 #include "paddle/fluid/inference/analysis/pass_manager.h"
 namespace paddle {
-namespace inference {
-namespace analysis {
 // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this
 // flag if not available.
 DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine);
 DECLARE_string(inference_analysis_graphviz_log_root);
+namespace inference {
+namespace analysis {
 class Analyzer : public OrderedRegistry<PassManager> {
  public:
   // Register all the pass-managers.
......
@@ -13,13 +13,21 @@
 // limitations under the License.
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <google/protobuf/text_format.h>
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 namespace paddle {
 namespace inference {
 namespace analysis {
-TEST_F(DFG_Tester, main) {
+TEST_F(DFG_Tester, analysis_without_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false;
+  Analyzer analyser;
+  analyser.Run(&argument);
+}
+
+TEST_F(DFG_Tester, analysis_with_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true;
   Analyzer analyser;
   analyser.Run(&argument);
 }
......
@@ -222,10 +222,19 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
   return stack_.top();
 }
+inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) {
+  return node.inlinks.size() == n;
+}
+
 GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
     const std::vector<Node *> &source) {
   PADDLE_ENFORCE(!source.empty(),
                  "Start points of topological sorting should not be empty!");
+  // CHECK all the inputs' in-degree is 0
+  for (auto *node : source) {
+    PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0));
+  }
+
   std::unordered_set<Node *> visited;
   std::unordered_set<Node *> to_visit{source.begin(), source.end()};
@@ -233,6 +242,11 @@ GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
   while (!to_visit.empty()) {
     std::vector<Node *> queue(to_visit.begin(), to_visit.end());
     for (auto *p : queue) {
+      if (p->deleted()) {
+        visited.insert(p);
+        to_visit.erase(p);
+        continue;
+      }
       inlink_visited.clear();
       std::copy_if(p->inlinks.begin(), p->inlinks.end(),
@@ -292,6 +306,37 @@ Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
   return sorted_[cursor_];
 }
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
+  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
+  std::unordered_set<Node *> inputs;
+  std::unordered_set<Node *> outputs;
+  // Input a Value, check whether its inlink is in the subgraph.
+  auto inlink_in_subgraph = [&](Node *n) {
+    for (auto *in : n->inlinks) {
+      if (nodes.count(in)) return true;
+    }
+    return false;
+  };
+  for (auto &node : graph) {
+    for (auto *in : node->inlinks) {
+      // The Value that is written by nodes inside a sub-graph shouldn't be the
+      // input of the sub-graph.
+      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
+          !inlink_in_subgraph(in)) {
+        inputs.insert(in);
+      }
+    }
+    for (auto *out : node->outlinks) {
+      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
+        outputs.insert(out);
+      }
+    }
+  }
+  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
+                        std::vector<Node *>(outputs.begin(), outputs.end()));
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
@@ -133,7 +133,7 @@ struct GraphTraits<DataFlowGraph> {
  private:
   std::vector<Node *> sorted_;
-  int cursor_{0};
+  size_t cursor_{0};
 };
 explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
@@ -173,36 +173,8 @@ struct GraphTraits<DataFlowGraph> {
 // Extract the inputs and outputs of a graph. The inputs and outputs of a
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
-static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
-  std::unordered_set<Node *> nodes(graph.begin(), graph.end());
-  std::unordered_set<Node *> inputs;
-  std::unordered_set<Node *> outputs;
-  // Input a Value, check whether its inlink is in the subgraph.
-  auto inlink_in_subgraph = [&](Node *n) {
-    for (auto *in : n->inlinks) {
-      if (nodes.count(in)) return true;
-    }
-    return false;
-  };
-  for (auto &node : graph) {
-    for (auto *in : node->inlinks) {
-      // The Value that is written by nodes inside a sub-graph shouldn't be the
-      // input of the sub-graph.
-      if (!nodes.count(in) && in->type() == Node::Type::kValue &&
-          !inlink_in_subgraph(in)) {
-        inputs.insert(in);
-      }
-    }
-    for (auto *out : node->outlinks) {
-      if (!nodes.count(out) && out->type() == Node::Type::kValue) {
-        outputs.insert(out);
-      }
-    }
-  }
-  return std::make_pair(std::vector<Node *>(inputs.begin(), inputs.end()),
-                        std::vector<Node *>(outputs.begin(), outputs.end()));
-}
+std::pair<std::vector<Node *>, std::vector<Node *>>
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph);  // NOLINT
 }  // namespace analysis
 }  // namespace inference
......
@@ -22,14 +22,18 @@
 namespace paddle {
 namespace inference {
+DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size");
+DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size");
+
 namespace analysis {
 using framework::proto::ProgramDesc;
 std::vector<std::string> ExtractParameters(
     const std::vector<std::unique_ptr<Node>> &nodes);
 bool DataFlowGraphToFluidPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument)
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc)
   PADDLE_ENFORCE(!argument->transformed_program_desc);
@@ -47,32 +51,34 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) {
 bool DataFlowGraphToFluidPass::Finalize() { return true; }
 void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) {
-  auto traits = GraphTraits<DataFlowGraph>(graph);
-  for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) {
-    if (it->deleted()) continue;
-    switch (it->type()) {
+  LOG(INFO) << "graph.inputs " << graph->inputs.size();
+  for (auto &node : GraphTraits<DataFlowGraph>(graph).nodes_in_TS()) {
+    if (node.deleted()) continue;
+    switch (node.type()) {
       case Node::Type::kFunction: {
-        LOG(INFO) << "add function " << it->repr();
-        AddFluidOp(&(*it));
+        LOG(INFO) << "add function " << node.repr();
+        AddFluidOp(&node);
       } break;
       case Node::Type::kFunctionBlock: {
-        LOG(INFO) << "add engine op " << it->repr() << " , "
-                  << static_cast<FunctionBlock*>(&(*it))->subgraph.size();
-        AddEngineOp(&(*it));
+        LOG(INFO) << "add engine op " << node.repr() << " , "
+                  << static_cast<FunctionBlock *>(&node)->subgraph.size();
+        AddEngineOp(&node);
       } break;
       default:
         continue;
     }
   }
+  PADDLE_ENFORCE(argument_->transformed_program_desc.get());
 }
 void DataFlowGraphToFluidPass::AddFluidOp(Node *node) {
   auto *ori_op = static_cast<framework::proto::OpDesc *>(node->pb_desc());
   // currently only the main block is analyzed.
   auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
   auto *op = main_block->add_ops();
   *op = *ori_op;  // copy the attributes, by default, these will not be changed
                   // by analysis phrase.
   // The inputs and outputs of the existing ops are not changed by tensorrt
@@ -80,43 +86,42 @@ void DataFlowGraphToFluidPass::AddFluidOp(Node* node) {
   // NOTE It might be changed by other passes in the long run.
 }
 void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph,
                        const framework::proto::BlockDesc &block) {
   static int counter{0};
   PADDLE_ENFORCE(node->IsFunctionBlock());
   framework::OpDesc desc;
   auto *func = static_cast<FunctionBlock *>(node);
   // collect inputs
   std::vector<std::string> io;
   for (auto *x : func->inlinks) {
     io.push_back(x->name());
   }
   desc.SetInput("Xs", io);
   // collect outputs
   io.clear();
   for (auto *x : func->outlinks) {
     io.push_back(x->name());
   }
   desc.SetOutput("Ys", io);
   desc.SetType("tensorrt_engine");
+  PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc");
   // Set attrs
   SetAttr(desc.Proto(), "subgraph", block.SerializeAsString());
-  SetAttr(desc.Proto(), "engine_unique_key",
-          "trt-" + std::to_string(counter++));
-  SetAttr(desc.Proto(), "max_batch", 100);  // TODO(Superjomn) add config latter
-  SetAttr(desc.Proto(), "max_workspace",
-          1024);  // TODO(Superjomn) add config latter
+  SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++));
+  SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize);
+  SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size);
   SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes()));
   node->SetPbMsg(desc.Proto()->SerializeAsString());
 }
 std::vector<std::string> ExtractParameters(
     const std::vector<std::unique_ptr<Node>> &nodes) {
   std::vector<std::string> parameters;
   for (const auto &node : nodes) {
     if (!node->IsValue()) continue;
     PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first");
     framework::proto::VarDesc var;
@@ -128,21 +133,30 @@ std::vector<std::string> ExtractParameters(
   return parameters;
 }
 void DataFlowGraphToFluidPass::AddEngineOp(Node *node) {
   // TODO(Superjomn) Here need to expose some arguments for default setting.
   PADDLE_ENFORCE(node->IsFunctionBlock());
   auto *block_node = static_cast<FunctionBlock *>(node);
   framework::proto::BlockDesc proto;
   framework::BlockDesc block_desc(nullptr, &proto);
+  block_desc.Proto()->set_parent_idx(-1);
+  block_desc.Proto()->set_idx(0);
+  LOG(INFO) << "origin variable size: "
+            << argument_->origin_program_desc->blocks(0).vars().size();
+  LOG(INFO) << "transformed variable size: "
+            << block_desc.Proto()->vars().size();
   // copy ops.
   for (auto *node : block_node->subgraph) {
     auto *op = block_desc.AppendOp();
     PADDLE_ENFORCE(!node->pb_msg().empty());
     op->Proto()->ParseFromString(node->pb_msg());
   }
+  *block_desc.Proto()->mutable_vars() =
+      argument_->origin_program_desc->blocks(0).vars();
+  PADDLE_ENFORCE(!block_desc.Proto()->vars().empty());
   CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto());
   auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex);
   auto *op = main_block->add_ops();
   PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block");
   op->ParseFromString(node->pb_msg());
 }
@@ -151,7 +165,7 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
  public:
   using Config = DFG_GraphvizDrawPass::Config;
   explicit DFG_DebuggerPass(const Config &config)
       : DFG_GraphvizDrawPass(config) {}
   std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
@@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
 };
 }  // namespace
 Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
       FLAGS_inference_analysis_graphviz_log_root,
       "data_flow_graph_to_fluid_graphviz_debugger"));
......
@@ -26,6 +26,10 @@
 namespace paddle {
 namespace inference {
+DECLARE_int32(tensorrt_max_batchsize);
+DECLARE_int32(tensorrt_workspace_size);
+
 namespace analysis {
 class DataFlowGraphToFluidPass final : public DataFlowGraphPass {
  public:
......
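Note (editorial, not part of the patch): with max_batch and max_workspace now taken from the tensorrt_max_batchsize and tensorrt_workspace_size gflags rather than hard-coded constants, they can be tuned per run. A minimal sketch of overriding them from the command line of a tool or test binary; the main() shown is illustrative, and it assumes the flag-defining analysis library is linked in and that the gflags namespace alias is available in the gflags version being used:

    #include <gflags/gflags.h>

    int main(int argc, char **argv) {
      // e.g. ./infer_tool --tensorrt_max_batchsize=64 --tensorrt_workspace_size=4096
      // The flags are registered by the library that DEFINEs them above, so
      // standard gflags parsing picks them up by name.
      gflags::ParseCommandLineFlags(&argc, &argv, true);
      // ... build the predictor / run the Analyzer here; CreateTrtEngineOp reads
      // FLAGS_tensorrt_max_batchsize and FLAGS_tensorrt_workspace_size when it
      // fills the "max_batch" and "max_workspace" attributes.
      return 0;
    }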
@@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) {
     no++;
   }
   // DFG is sensitive to ProgramDesc, be careful to change the existing models.
-  ASSERT_EQ(no, 82);
+  ASSERT_EQ(no, 83);
 }
 }  // namespace analysis
......
@@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) {
   ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc);
   PADDLE_ENFORCE(argument);
   if (!argument->main_dfg) {
-    LOG(INFO) << "Init DFG";
     argument->main_dfg.reset(new DataFlowGraph);
   }
   desc_ = argument->origin_program_desc.get();
@@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     v->SetPbMsg(var.SerializeAsString());
     var2id[var.name()] = v->id();
   }
+
   for (int i = 0; i < main_block.ops_size(); i++) {
     const auto &op = main_block.ops(i);
     auto *o = graph->nodes.Create(Node::Type::kFunction);
@@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) {
     o->SetPbMsg(op.SerializeAsString());
     // set inputs and outputs
-    // TODO(Superjomn) make sure the InputNames is the real variable name.
+    std::unordered_set<Node *> inlinks;
     for (int j = 0; j < op.inputs_size(); j++) {
       auto &in_var = op.inputs(j);
       for (int k = 0; k < in_var.arguments_size(); k++) {
         auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k)));
         in->outlinks.push_back(o);
         o->inlinks.push_back(in);
+        inlinks.insert(in);
       }
     }
     for (int j = 0; j < op.outputs_size(); j++) {
       auto &out_var = op.outputs(j);
       for (int k = 0; k < out_var.arguments_size(); k++) {
         auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]);
+        if (inlinks.count(out)) {
+          // Loop found, for example, a = op(a), use SSA, change to a1 = op(a).
+          auto *out_alias = graph->nodes.Create(Node::Type::kValue);
+          out_alias->SetName(out->name());
+          out_alias->SetPbDesc(out->pb_desc());
+          out_alias->SetPbMsg(out->pb_msg());
+          var2id[out_alias->name()] = out_alias->id();  // update a -> a0
+          LOG(INFO) << "loop found in graph, create SSA alias node ["
+                    << out_alias->repr() << "] for [" << out->repr() << "]";
+          out = out_alias;
+        }
         out->inlinks.push_back(o);
         o->outlinks.push_back(out);
       }
......
@@ -24,12 +24,12 @@ namespace analysis {
 TEST_F(DFG_Tester, Init) {
   FluidToDataFlowGraphPass pass;
   pass.Initialize(&argument);
-  DataFlowGraph graph;
-  pass.Run(&graph);
+  pass.Run(argument.main_dfg.get());
   // Analysis is sensitive to ProgramDesc, careful to change the original model.
-  ASSERT_EQ(graph.nodes.size(), 37UL);
+  ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL);
   pass.Finalize();
-  LOG(INFO) << '\n' << graph.DotString();
+  ASSERT_FALSE(argument.main_dfg->DotString().empty());
+  EXPECT_FALSE(argument.main_dfg->inputs.empty());
 }
 }  // namespace analysis
......
@@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass(
 void TensorRTSubGraphPass::Run(DataFlowGraph *graph) {
   SubGraphFuse(graph, node_inside_subgraph_teller_)();
+  VLOG(4) << "debug info "
+          << graph->HumanReadableInfo(false /*show_values*/,
+                                      true /*show_functions*/);
 }
 }  // namespace analysis
......
@@ -82,7 +82,7 @@ inference_api_test(test_api_impl
 if(WITH_GPU AND TENSORRT_FOUND)
   cc_library(paddle_inference_tensorrt_subgraph_engine
     SRCS api_tensorrt_subgraph_engine.cc
-    DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api)
+    DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter)
   inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec)
 endif()
......
@@ -39,7 +39,7 @@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) {
 bool PaddleInferenceAnakinPredictor::Run(
     const std::vector<PaddleTensor> &inputs,
-    std::vector<PaddleTensor> *output_data) {
+    std::vector<PaddleTensor> *output_data, int batch_size) {
   for (const auto &input : inputs) {
     if (input.dtype != PaddleDType::FLOAT32) {
       LOG(ERROR) << "Only support float type inputs. " << input.name
......
@@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor {
   // NOTE Unlike the native engine, the buffers of anakin engine's output_data
   // should be allocated first.
   bool Run(const std::vector<PaddleTensor>& inputs,
-           std::vector<PaddleTensor>* output_data) override;
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override;
   std::unique_ptr<PaddlePredictor> Clone() override;
......
@@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() {
 }
 bool NativePaddlePredictor::Run(const std::vector<PaddleTensor> &inputs,
-                                std::vector<PaddleTensor> *output_data) {
+                                std::vector<PaddleTensor> *output_data,
+                                int batch_size) {
   VLOG(3) << "Predictor::predict";
   Timer timer;
   timer.tic();
......
@@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor {
   bool Init(std::shared_ptr<framework::Scope> parent_scope);
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override;
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = -1) override;
   std::unique_ptr<PaddlePredictor> Clone() override;
......
@@ -16,6 +16,7 @@
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/operators/tensorrt_engine_op.h"
 namespace paddle {
@@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
       return false;
     }
-    // Analyze inference_program
-    Argument argument;
-    argument.origin_program_desc.reset(
-        new ProgramDesc(*inference_program_->Proto()));
-    Singleton<Analyzer>::Global().Run(&argument);
-    CHECK(argument.transformed_program_desc);
-    VLOG(5) << "transformed program:\n"
-            << argument.transformed_program_desc->SerializeAsString();
-    VLOG(5) << "to prepare executor";
-    *inference_program_->Proto() = *argument.transformed_program_desc;
+    OptimizeInferenceProgram();
     ctx_ = executor_->Prepare(*inference_program_, 0);
     VLOG(5) << "to create variables";
@@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
     return true;
   }
+  bool Run(const std::vector<PaddleTensor>& inputs,
+           std::vector<PaddleTensor>* output_data,
+           int batch_size = -1) override {
+    PADDLE_ENFORCE_GT(batch_size, 0,
+                      "TensorRT engine needs the argument batch_size set");
+    FLAGS_tensorrt_engine_batch_size = batch_size;
+    return NativePaddlePredictor::Run(inputs, output_data, batch_size);
+  }
+
+  void OptimizeInferenceProgram() {
+    // Analyze inference_program
+    Argument argument;
+    argument.origin_program_desc.reset(
+        new ProgramDesc(*inference_program_->Proto()));
+    Singleton<Analyzer>::Global().Run(&argument);
+    CHECK(argument.transformed_program_desc);
+    VLOG(5) << "transformed program:\n"
+            << argument.transformed_program_desc->SerializeAsString();
+    VLOG(5) << "to prepare executor";
+    inference_program_.reset(
+        new framework::ProgramDesc(*argument.transformed_program_desc));
+  }
+
  private:
   TensorRTConfig config_;
 };
......
@@ -98,7 +98,8 @@ class PaddlePredictor {
   // responsible for the output tensor's buffer, either allocated or passed from
   // outside.
   virtual bool Run(const std::vector<PaddleTensor>& inputs,
-                   std::vector<PaddleTensor>* output_data) = 0;
+                   std::vector<PaddleTensor>* output_data,
+                   int batch_size = -1) = 0;
   // Clone a predictor that share the model weights, the Cloned predictor should
   // be thread-safe.
......
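Note (editorial, not part of the patch): with the extended interface, callers can pass the batch size explicitly through the third Run() argument; the TensorRT subgraph predictor requires a positive value and forwards it to FLAGS_tensorrt_engine_batch_size before delegating to the native predictor. A minimal calling sketch mirroring the word2vec comparison test below; the model path and the function name are placeholders:

    #include <cstdint>
    #include <string>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    // Hypothetical caller: build a TensorRT subgraph predictor and run one batch.
    void RunTrtSubgraphOnce(const std::string &model_dir /* placeholder path */) {
      paddle::TensorRTConfig config;
      config.model_dir = model_dir;
      config.use_gpu = true;
      config.fraction_of_gpu_memory = 0.3;
      config.device = 0;
      auto predictor = paddle::CreatePaddlePredictor<
          paddle::TensorRTConfig, paddle::PaddleEngineKind::kAutoMixedTensorRT>(config);

      // Prepare a [10, 1] INT64 input, as in the test below.
      std::vector<int64_t> ids(10);
      for (size_t i = 0; i < ids.size(); i++) ids[i] = static_cast<int64_t>(i);
      paddle::PaddleTensor tensor{
          .name = "",
          .shape = std::vector<int>({10, 1}),
          .data = paddle::PaddleBuf(ids.data(), ids.size() * sizeof(int64_t)),
          .dtype = paddle::PaddleDType::INT64};
      std::vector<paddle::PaddleTensor> inputs(4, tensor);

      std::vector<paddle::PaddleTensor> outputs;
      // batch_size must be > 0 on the TensorRT path and must not exceed the
      // engine op's max_batch attribute.
      predictor->Run(inputs, &outputs, /*batch_size=*/10);
    }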
@@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor {
     LOG(INFO) << "I get other_config " << config.other_config;
   }
   bool Run(const std::vector<PaddleTensor> &inputs,
-           std::vector<PaddleTensor> *output_data) override {
+           std::vector<PaddleTensor> *output_data,
+           int batch_size = 0) override {
     LOG(INFO) << "Run";
     return false;
   }
......
@@ -15,50 +15,79 @@
 #include <gflags/gflags.h>
 #include <glog/logging.h>
 #include <gtest/gtest.h>
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 namespace paddle {
 DEFINE_string(dirname, "", "Directory of the inference model.");
-void Main(bool use_gpu) {
+void CompareTensorRTWithFluid(bool enable_tensorrt) {
+  FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt;
   //# 1. Create PaddlePredictor with a config.
-  TensorRTConfig config;
-  config.model_dir = FLAGS_dirname + "word2vec.inference.model";
-  config.use_gpu = use_gpu;
-  config.fraction_of_gpu_memory = 0.15;
-  config.device = 0;
-  auto predictor =
-      CreatePaddlePredictor<TensorRTConfig,
-                            PaddleEngineKind::kAutoMixedTensorRT>(config);
+  NativeConfig config0;
+  config0.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config0.use_gpu = true;
+  config0.fraction_of_gpu_memory = 0.3;
+  config0.device = 0;
+
+  TensorRTConfig config1;
+  config1.model_dir = FLAGS_dirname + "word2vec.inference.model";
+  config1.use_gpu = true;
+  config1.fraction_of_gpu_memory = 0.3;
+  config1.device = 0;
+
+  auto predictor0 =
+      CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);
+  auto predictor1 =
+      CreatePaddlePredictor<TensorRTConfig,
                            PaddleEngineKind::kAutoMixedTensorRT>(config1);
-  for (int batch_id = 0; batch_id < 3; batch_id++) {
+  for (int batch_id = 0; batch_id < 1; batch_id++) {
     //# 2. Prepare input.
-    int64_t data[4] = {1, 2, 3, 4};
-    PaddleTensor tensor{.name = "",
-                        .shape = std::vector<int>({4, 1}),
-                        .data = PaddleBuf(data, sizeof(data)),
-                        .dtype = PaddleDType::INT64};
+    std::vector<int64_t> data(20);
+    for (int i = 0; i < 20; i++) data[i] = i;
+    PaddleTensor tensor{
+        .name = "",
+        .shape = std::vector<int>({10, 1}),
+        .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)),
+        .dtype = PaddleDType::INT64};
     // For simplicity, we set all the slots with the same data.
     std::vector<PaddleTensor> slots(4, tensor);
     //# 3. Run
-    std::vector<PaddleTensor> outputs;
-    CHECK(predictor->Run(slots, &outputs));
+    std::vector<PaddleTensor> outputs0;
+    std::vector<PaddleTensor> outputs1;
+    CHECK(predictor0->Run(slots, &outputs0));
+    CHECK(predictor1->Run(slots, &outputs1, 10));
    //# 4. Get output.
-    ASSERT_EQ(outputs.size(), 1UL);
-    LOG(INFO) << "output buffer size: " << outputs.front().data.length();
-    const size_t num_elements = outputs.front().data.length() / sizeof(float);
-    // The outputs' buffers are in CPU memory.
-    for (size_t i = 0; i < std::min(5UL, num_elements); i++) {
-      LOG(INFO) << static_cast<float*>(outputs.front().data.data())[i];
+    ASSERT_EQ(outputs0.size(), 1UL);
+    ASSERT_EQ(outputs1.size(), 1UL);
+    const size_t num_elements = outputs0.front().data.length() / sizeof(float);
+    const size_t num_elements1 = outputs1.front().data.length() / sizeof(float);
+    EXPECT_EQ(num_elements, num_elements1);
+    auto *data0 = static_cast<float *>(outputs0.front().data.data());
+    auto *data1 = static_cast<float *>(outputs1.front().data.data());
+    ASSERT_GT(num_elements, 0UL);
+    for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) {
+      EXPECT_NEAR(data0[i], data1[i], 1e-3);
     }
   }
 }
-TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); }
+TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) {
+  CompareTensorRTWithFluid(false);
+}
+
+TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) {
+  CompareTensorRTWithFluid(true);
+}
 }  // namespace paddle
@@ -93,6 +93,10 @@ class OpConverter {
   framework::Scope* scope_{nullptr};
 };
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
 #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \
   struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \
     trt_##op_type__##_converter() { \
@@ -111,7 +115,3 @@ class OpConverter {
   extern int TouchConverterRegister_##op_type__(); \
   static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \
       TouchConverterRegister_##op_type__();
-
-}  // namespace tensorrt
-}  // namespace inference
-}  // namespace paddle
@@ -26,18 +26,20 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
 void TensorRTEngine::Execute(int batch_size) {
-  std::vector<void*> buffers;
-  for (auto& buf : buffers_) {
+  batch_size_ = batch_size;
+  std::vector<void *> buffers;
+  for (auto &buf : buffers_) {
     PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
     PADDLE_ENFORCE_GT(buf.max_size, 0);
     PADDLE_ENFORCE(buf.device == DeviceType::GPU);
     buffers.push_back(buf.buffer);
   }
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr);
   cudaStreamSynchronize(*stream_);
 }
@@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) {
 TensorRTEngine::~TensorRTEngine() {
   cudaStreamSynchronize(*stream_);
   // clean buffer
   for (auto &buf : buffers_) {
     if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
       PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
       buf.buffer = nullptr;
@@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() {
   // allocate GPU buffers.
   buffers_.resize(buffer_sizes_.size());
   for (auto &item : buffer_sizes_) {
+    // The output buffers are not set in the network building phrase, need to
+    // infer from the TesorRT network.
     if (item.second == 0) {
       auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
       auto dims = infer_engine_->getBindingDimensions(slot_offset);
       item.second = kDataTypeSize[static_cast<int>(
                         infer_engine_->getBindingDataType(slot_offset))] *
                     analysis::AccuDims(dims.d, dims.nbDims);
+      PADDLE_ENFORCE_GT(item.second, 0);
     }
-    auto& buf = buffer(item.first);
+    auto &buf = buffer(item.first);
+    buf.max_size = item.second * max_batch_;
     CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second));
-    VLOG(4) << "buffer malloc " << item.first << " " << item.second << " "
-            << buf.buffer;
-    buf.size = buf.max_size = item.second;
+    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size));
+    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
+    // buf.size will changed in the runtime.
+    buf.size = 0;
     buf.device = DeviceType::GPU;
   }
 }
 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
                                                 nvinfer1::DataType dtype,
                                                 const nvinfer1::Dims &dims) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s",
                     name);
   PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first");
   auto *input = infer_network_->addInput(name.c_str(), dtype, dims);
   PADDLE_ENFORCE(input, "infer network add input %s failed", name);
   buffer_sizes_[name] = kDataTypeSize[static_cast<int>(dtype)] *
                         analysis::AccuDims(dims.d, dims.nbDims);
@@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name,
   return input;
 }
 void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset,
                                    const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
   auto *output = layer->getOutput(offset);
   SetITensor(name, output);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
@@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset,
   buffer_sizes_[name] = 0;
 }
 void TensorRTEngine::DeclareOutput(const std::string &name) {
   PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s",
                     name);
   auto *output = TensorRTEngine::GetITensor(name);
   PADDLE_ENFORCE(output != nullptr);
   output->setName(name.c_str());
   PADDLE_ENFORCE(!output->isNetworkInput());
@@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) {
   buffer_sizes_[name] = 0;
 }
 void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
   return buffer(name).buffer;
 }
 void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
                                     size_t max_size) {
   // determine data size
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
   PADDLE_ENFORCE_GT(it->second, 0);
   PADDLE_ENFORCE_GE(max_size, it->second);
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
   PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second,
                                     cudaMemcpyDeviceToDevice, *stream_),
                     0);
 }
 void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
                                     size_t max_size) {
+  VLOG(4) << "get output in cpu";
+  auto &buf = buffer(name);
+  // Update needed buffer size.
+  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
+  auto dims = infer_engine_->getBindingDimensions(slot_offset);
+  buf.size = kDataTypeSize[static_cast<int>(
+                 infer_engine_->getBindingDataType(slot_offset))] *
+             analysis::AccuDims(dims.d, dims.nbDims);
+  PADDLE_ENFORCE_LE(buf.size, buf.max_size);
   // determine data size
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_GE(max_size, it->second);
-  auto& buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second,
-                                       cudaMemcpyDeviceToHost, *stream_));
+  // DEBUG
+  memset(dst, 0, buf.size);
+  PADDLE_ENFORCE_EQ(
+      0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost));
 }
 Buffer &TensorRTEngine::buffer(const std::string &name) {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
   auto it = buffer_sizes_.find(name);
   PADDLE_ENFORCE(it != buffer_sizes_.end());
@@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) {
   return buffers_[slot_offset];
 }
 void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
                                      size_t size) {
   auto &buf = buffer(name);
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
+  PADDLE_ENFORCE_NOT_NULL(data);
+  PADDLE_ENFORCE_NOT_NULL(stream_);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
+  buf.size = size;
   PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
                                        cudaMemcpyHostToDevice, *stream_));
 }
 void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
                                      size_t size) {
   auto &buf = buffer(name);
+  buf.size = size;
   PADDLE_ENFORCE_NOT_NULL(buf.buffer);
   PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
   PADDLE_ENFORCE(buf.device == DeviceType::GPU);
@@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data,
                                        cudaMemcpyDeviceToDevice, *stream_));
 }
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
   PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s",
                     name);
   itensor_map_[name] = tensor;
 }
 nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) {
   PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name);
   return itensor_map_[name];
 }
......
@@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase {
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
         stream_(stream ? stream : &default_stream_),
-        logger_(logger) {}
+        logger_(logger) {
+    cudaStreamCreate(&default_stream_);
+  }
   virtual ~TensorRTEngine();
@@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase {
   int max_batch_;
   // the max memory size the engine uses
   int max_workspace_;
+  // batch size of the current data, will be updated each Executation.
+  int batch_size_{-1};
+
   cudaStream_t* stream_;
   // If stream_ is not set from outside, hold its own stream.
   cudaStream_t default_stream_;
......
@@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   LOG(INFO) << "to get output";
   float y_cpu[2] = {-1., -1.};
+  auto dims = engine_->GetITensor("y")->getDimensions();
+  ASSERT_EQ(dims.nbDims, 3);
+  ASSERT_EQ(dims.d[0], 2);
+  ASSERT_EQ(dims.d[1], 1);
   engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2);
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
......
@@ -168,6 +168,8 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP(relu);\n")
   elseif(${TARGET} STREQUAL "fake_dequantize")
     file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
+  elseif(${TARGET} STREQUAL "tensorrt_engine_op")
+    message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
   else()
     file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
   endif()
@@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 if (WITH_GPU AND TENSORRT_FOUND)
-  op_library(tensorrt_engine_op DEPS tensorrt_engine)
+  op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter)
   nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc
-    DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter
+    DEPS tensorrt_engine_op
     analysis)
 else()
   set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op)
......
@@ -24,6 +24,9 @@
 #include "paddle/fluid/operators/tensorrt_engine_op.h"
 namespace paddle {
+DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT");
+
 namespace operators {
 using inference::Singleton;
@@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
                  "TensorRT' tensor input requires at least 2 dimensions");
   PADDLE_ENFORCE_LE(shape.size(), 4UL,
                     "TensorRT' tensor input requires at most 4 dimensions");
-
   switch (shape.size()) {
     case 2:
       return nvinfer1::Dims2(shape[0], shape[1]);
@@ -90,27 +92,36 @@ void TensorRTEngineKernel<DeviceContext, T>::Prepare(
   engine->InitNetwork();
   framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
+  VLOG(4) << "parsed var size " << block.AllVars().size();
   // Add inputs
   VLOG(4) << "declare inputs";
   for (auto &input : context.Inputs("Xs")) {
     VLOG(4) << "declare input " << input;
     auto *var = block.FindVar(input);
+    // TensorRT engine need to create parameters. The parameter's description
+    // should be set in
+    PADDLE_ENFORCE(var, "no variable called %s", input);
     PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
                       "TensorRT engine only takes LoDTensor as input");
     auto shape = var->GetShape();
+    // For the special batch_size placeholder -1, drop it and pass the real
+    // shape of data.
+    // TODO(Superjomn) fix this with batch broadcast, or it can't handle
+    // variational batch size.
+    if (shape[0] == -1) {
+      shape[0] = FLAGS_tensorrt_engine_batch_size;
+    }
     engine->DeclareInput(
         input, FluidDataType2TRT(
                    var->Proto()->type().lod_tensor().tensor().data_type()),
-        Vec2TRT_Dims(var->GetShape()));
+        Vec2TRT_Dims(shape));
   }
   inference::Singleton<inference::tensorrt::OpConverter>::Global().ConvertBlock(
       block_desc, parameters, context.scope(), engine);
   // Add outputs
-  VLOG(4) << "declare outputs";
   for (auto &output : context.Outputs("Ys")) {
-    VLOG(4) << "declare output " << output;
     engine->DeclareOutput(output);
   }
@@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL(
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
+// A trick to compile with the needed TensorRT op converter.
+USE_TRT_CONVERTER(mul)
+
 #endif  // PADDLE_WITH_CUDA
@@ -24,6 +24,9 @@
 #include "paddle/fluid/inference/tensorrt/engine.h"
 namespace paddle {
+DECLARE_int32(tensorrt_engine_batch_size);
+
 namespace operators {
 using inference::Singleton;
@@ -53,7 +56,6 @@ template <typename DeviceContext, typename T>
 class TensorRTEngineKernel : public framework::OpKernel<T> {
  public:
  void Compute(const framework::ExecutionContext& context) const override {
-    VLOG(4) << "TensorRTEngineKernel executing";
     auto engine_name = context.Attr<std::string>("engine_uniq_key");
     if (!Singleton<TRT_EngineManager>::Global().HasEngine(engine_name)) {
       Prepare(context);
@@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
     auto* engine = Singleton<TRT_EngineManager>::Global().Get(engine_name);
     auto input_names = context.op().Inputs("Xs");
     PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs");
-    // Try to determine a batch_size
-    auto& tensor0 = inference::analysis::GetFromScope<framework::LoDTensor>(
-        context.scope(), input_names.front());
-    int batch_size = tensor0.dims()[0];
-    PADDLE_ENFORCE_LE(batch_size, context.Attr<int>("max_batch"));
+    PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size,
+                      context.Attr<int>("max_batch"));
     // Convert input tensor from fluid to engine.
     for (const auto& x : context.Inputs("Xs")) {
@@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       }
     }
     // Execute the engine.
-    PADDLE_ENFORCE_GT(batch_size, 0);
-    engine->Execute(batch_size);
+    PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0);
+    engine->Execute(FLAGS_tensorrt_engine_batch_size);
     // Convert output tensor from engine to fluid
     for (const auto& y : context.Outputs("Ys")) {
       // convert output and copy to fluid.
@@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel<T> {
       auto* fluid_v = context.scope().FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
      auto* fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      fluid_t->Resize(framework::make_ddim(ddim));
       auto size = inference::analysis::AccuDims(dims.d, dims.nbDims);
-      if (platform::is_cpu_place(fluid_t->place())) {
-        // TODO(Superjomn) change this float to dtype size.
-        engine->GetOutputInCPU(
-            y, fluid_t->mutable_data<float>(platform::CPUPlace()),
-            size * sizeof(float));
-      } else {
-        engine->GetOutputInGPU(
-            y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
-            size * sizeof(float));
-      }
+      fluid_t->Resize(framework::make_ddim(ddim));
+      // TODO(Superjomn) find some way to determine which device to output the
+      // tensor.
+      // if (platform::is_cpu_place(fluid_t->place())) {
+      // TODO(Superjomn) change this float to dtype size.
+      engine->GetOutputInCPU(y,
+                             fluid_t->mutable_data<float>(platform::CPUPlace()),
+                             size * sizeof(float));
+      //} else {
+      //  engine->GetOutputInGPU(
+      //      y, fluid_t->mutable_data<float>(platform::CUDAPlace()),
+      //      size * sizeof(float));
+      //}
     }
     cudaStreamSynchronize(*engine->stream());
......