diff --git a/benchmark/paddle/image/run.sh b/benchmark/paddle/image/run.sh index 717ed487ba7657db6535efcb1128a355a0f15eaf..5b58a8d773aab795e5439b0f0e5d81bec66b5f56 100755 --- a/benchmark/paddle/image/run.sh +++ b/benchmark/paddle/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh index 62c9bf6efd3810f506fd4592b2ba3a21b1b7f0e7..0fad5e04cc992a3ec97591d3833957bb7517a8f3 100755 --- a/benchmark/paddle/image/run_mkl_infer.sh +++ b/benchmark/paddle/image/run_mkl_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh index 03d2d378fb72e36f765d89af788f6ee96fe21d4e..1583bf134a276a08aa2f8e84dc63adbb205a83d6 100755 --- a/benchmark/paddle/image/run_mkl_train.sh +++ b/benchmark/paddle/image/run_mkl_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh index a9a7b8a66717c4be0543c3fe2db293fe199e3dc4..987381cabc2e793886099212660723c122b73bb0 100755 --- a/benchmark/paddle/image/run_openblas_infer.sh +++ b/benchmark/paddle/image/run_openblas_infer.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function clock_to_seconds() { diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh index 935cff6f2c97d25d6de556cfee25e27dbe49b5b6..cc64e1d09da02087b1737190a0b75dc7758600a6 100755 --- a/benchmark/paddle/image/run_openblas_train.sh +++ b/benchmark/paddle/image/run_openblas_train.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function train() { diff --git a/benchmark/paddle/rnn/run.sh b/benchmark/paddle/rnn/run.sh index e9dfeb2e525979f47e4ef48f7610dc1007900f2c..f99a562b3f88a98560f4bf7aee98ceee9daefe67 100755 --- a/benchmark/paddle/rnn/run.sh +++ b/benchmark/paddle/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e 
function train() { diff --git a/benchmark/tensorflow/image/run.sh b/benchmark/tensorflow/image/run.sh index eade36beb9df5f8d3978939216e058203e024c1a..cf894fe3f2dca24e3acf863d625b3a7008793b83 100755 --- a/benchmark/tensorflow/image/run.sh +++ b/benchmark/tensorflow/image/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/image/run_multi.sh b/benchmark/tensorflow/image/run_multi.sh index 69faa4331744f2276e7706185ae10bc507f95764..bf1435bc55b90669e0b8bd893b8ed7bbb99d51e2 100755 --- a/benchmark/tensorflow/image/run_multi.sh +++ b/benchmark/tensorflow/image/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run.sh b/benchmark/tensorflow/rnn/run.sh index bb4c69cb95f965eff35f1c5a60376bf1e84f841b..db10eefdea8676ad34fb84a161f0fc1309147824 100755 --- a/benchmark/tensorflow/rnn/run.sh +++ b/benchmark/tensorflow/rnn/run.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh index c2d7dd597e6da54cd5c4cda311fbbd18486b4647..ec62fc26b51543f2f8ddfc5e73aa6ff7d611e4dd 100755 --- a/benchmark/tensorflow/rnn/run_multi.sh +++ b/benchmark/tensorflow/rnn/run_multi.sh @@ -1,3 +1,5 @@ +#!/bin/bash + set -e function test() { diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index a4625f008c15300b88ef0bce71cd7d8aa473c9a8..b3a1075e5adf4a24bf32017574c061f36c46ba8c 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -22,8 +22,6 @@ #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" namespace paddle { -namespace inference { -namespace analysis { DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); @@ -31,6 +29,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, 
DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +namespace inference { +namespace analysis { + class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e9e14fb1947da059c8d126d3da182ce446f6421e..0132bf5b9c6552391aaa19542669487f42b685a7 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -45,14 +45,15 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/pass_manager.h" namespace paddle { -namespace inference { -namespace analysis { // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +namespace inference { +namespace analysis { + class Analyzer : public OrderedRegistry { public: // Register all the pass-managers. diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index d7c1a72932a39f878add2bb884e280b91d3c38c0..25a440e7e71fddb38cc515f99d15231675a8172e 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -13,13 +13,21 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, main) { +TEST_F(DFG_Tester, analysis_without_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Analyzer analyser; + analyser.Run(&argument); +} + +TEST_F(DFG_Tester, analysis_with_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index bd24e8a7d9c20b8cd9c4e41a76ffc33a004a9a69..8a3af0a8ebd5bad7be7046fa399cca4920da3d71 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -222,10 +222,19 @@ Node *GraphTraits::NodesDFSIterator::operator->() { return stack_.top(); } +inline bool CheckNodeIndegreeEquals(const Node &node, size_t n) { + return node.inlinks.size() == n; +} + GraphTraits::NodesTSIterator::NodesTSIterator( const std::vector &source) { PADDLE_ENFORCE(!source.empty(), "Start points of topological sorting should not be empty!"); + // CHECK all the inputs' in-degree is 0 + for (auto *node : source) { + PADDLE_ENFORCE(CheckNodeIndegreeEquals(*node, 0)); + } + std::unordered_set visited; std::unordered_set to_visit{source.begin(), source.end()}; @@ -233,6 +242,11 @@ GraphTraits::NodesTSIterator::NodesTSIterator( while (!to_visit.empty()) { std::vector queue(to_visit.begin(), to_visit.end()); for (auto *p : queue) { + if (p->deleted()) { + visited.insert(p); + to_visit.erase(p); + continue; + } inlink_visited.clear(); std::copy_if(p->inlinks.begin(), p->inlinks.end(), @@ -292,6 +306,37 @@ Node *GraphTraits::NodesTSIterator::operator->() { return sorted_[cursor_]; } +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT + std::unordered_set 
nodes(graph.begin(), graph.end()); + std::unordered_set inputs; + std::unordered_set outputs; + // Input a Value, check whether its inlink is in the subgraph. + auto inlink_in_subgraph = [&](Node *n) { + for (auto *in : n->inlinks) { + if (nodes.count(in)) return true; + } + return false; + }; + for (auto &node : graph) { + for (auto *in : node->inlinks) { + // The Value that is written by nodes inside a sub-graph shouldn't be the + // input of the sub-graph. + if (!nodes.count(in) && in->type() == Node::Type::kValue && + !inlink_in_subgraph(in)) { + inputs.insert(in); + } + } + for (auto *out : node->outlinks) { + if (!nodes.count(out) && out->type() == Node::Type::kValue) { + outputs.insert(out); + } + } + } + return std::make_pair(std::vector(inputs.begin(), inputs.end()), + std::vector(outputs.begin(), outputs.end())); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 5dd914d1971bfb5bcc0b1db41d73e2b67120bc06..1c60d5de21538043962cc58a6f508aea635fe8c4 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -133,7 +133,7 @@ struct GraphTraits { private: std::vector sorted_; - int cursor_{0}; + size_t cursor_{0}; }; explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} @@ -173,36 +173,8 @@ struct GraphTraits { // Extract the inputs and outputs of a graph. The inputs and outputs of a // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. -static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT - std::unordered_set nodes(graph.begin(), graph.end()); - std::unordered_set inputs; - std::unordered_set outputs; - // Input a Value, check whether its inlink is in the subgraph. 
- auto inlink_in_subgraph = [&](Node *n) { - for (auto *in : n->inlinks) { - if (nodes.count(in)) return true; - } - return false; - }; - for (auto &node : graph) { - for (auto *in : node->inlinks) { - // The Value that is written by nodes inside a sub-graph shouldn't be the - // input of the sub-graph. - if (!nodes.count(in) && in->type() == Node::Type::kValue && - !inlink_in_subgraph(in)) { - inputs.insert(in); - } - } - for (auto *out : node->outlinks) { - if (!nodes.count(out) && out->type() == Node::Type::kValue) { - outputs.insert(out); - } - } - } - return std::make_pair(std::vector(inputs.begin(), inputs.end()), - std::vector(outputs.begin(), outputs.end())); -} +std::pair, std::vector> +ExtractInputAndOutputOfSubGraph(std::vector &graph); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index 29ca008123addf07959b965a4b54bf55b18c401d..2328d870422c5a31c22d7b09980aae35e01b2b25 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -22,14 +22,18 @@ namespace paddle { namespace inference { + +DEFINE_int32(tensorrt_max_batchsize, 300, "TensorRT maximum batch size"); +DEFINE_int32(tensorrt_workspace_size, 2048, "TensorRT workspace size"); + namespace analysis { using framework::proto::ProgramDesc; std::vector ExtractParameters( - const std::vector>& nodes); + const std::vector> &nodes); -bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { +bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument) ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc) PADDLE_ENFORCE(!argument->transformed_program_desc); @@ -47,76 +51,77 @@ bool DataFlowGraphToFluidPass::Initialize(Argument* argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } -void 
DataFlowGraphToFluidPass::Run(DataFlowGraph* graph) { - auto traits = GraphTraits(graph); - for (auto it = traits.nodes().begin(); it != traits.nodes().end(); ++it) { - if (it->deleted()) continue; +void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { + LOG(INFO) << "graph.inputs " << graph->inputs.size(); + for (auto &node : GraphTraits(graph).nodes_in_TS()) { + if (node.deleted()) continue; - switch (it->type()) { + switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << it->repr(); - AddFluidOp(&(*it)); + LOG(INFO) << "add function " << node.repr(); + AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << it->repr() << " , " - << static_cast(&(*it))->subgraph.size(); - AddEngineOp(&(*it)); + LOG(INFO) << "add engine op " << node.repr() << " , " + << static_cast(&node)->subgraph.size(); + AddEngineOp(&node); } break; default: continue; } } + + PADDLE_ENFORCE(argument_->transformed_program_desc.get()); } -void DataFlowGraphToFluidPass::AddFluidOp(Node* node) { - auto* ori_op = static_cast(node->pb_desc()); +void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { + auto *ori_op = static_cast(node->pb_desc()); // currently only the main block is analyzed. - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. + // by analysis phrase. // The inputs and outputs of the existing ops are not changed by tensorrt // subgraph pass. // NOTE It might be changed by other passes in the long run. 
} -void CreateTrtEngineOp(Node* node, const DataFlowGraph& graph, - const framework::proto::BlockDesc& block) { +void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, + const framework::proto::BlockDesc &block) { static int counter{0}; PADDLE_ENFORCE(node->IsFunctionBlock()); framework::OpDesc desc; - auto* func = static_cast(node); + auto *func = static_cast(node); // collect inputs std::vector io; - for (auto* x : func->inlinks) { + for (auto *x : func->inlinks) { io.push_back(x->name()); } desc.SetInput("Xs", io); // collect outputs io.clear(); - for (auto* x : func->outlinks) { + for (auto *x : func->outlinks) { io.push_back(x->name()); } desc.SetOutput("Ys", io); - desc.SetType("tensorrt_engine"); + + PADDLE_ENFORCE(!block.vars().empty(), "the block has no var-desc"); // Set attrs SetAttr(desc.Proto(), "subgraph", block.SerializeAsString()); - SetAttr(desc.Proto(), "engine_unique_key", - "trt-" + std::to_string(counter++)); - SetAttr(desc.Proto(), "max_batch", 100); // TODO(Superjomn) add config latter - SetAttr(desc.Proto(), "max_workspace", - 1024); // TODO(Superjomn) add config latter + SetAttr(desc.Proto(), "engine_uniq_key", "trt-" + std::to_string(counter++)); + SetAttr(desc.Proto(), "max_batch", FLAGS_tensorrt_max_batchsize); + SetAttr(desc.Proto(), "max_workspace", FLAGS_tensorrt_workspace_size); SetAttr(desc.Proto(), "parameters", ExtractParameters(graph.nodes.nodes())); node->SetPbMsg(desc.Proto()->SerializeAsString()); } std::vector ExtractParameters( - const std::vector>& nodes) { + const std::vector> &nodes) { std::vector parameters; - for (const auto& node : nodes) { + for (const auto &node : nodes) { if (!node->IsValue()) continue; PADDLE_ENFORCE(!node->pb_msg().empty(), "pb_msg should be set first"); framework::proto::VarDesc var; @@ -128,21 +133,30 @@ std::vector ExtractParameters( return parameters; } -void DataFlowGraphToFluidPass::AddEngineOp(Node* node) { +void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { // TODO(Superjomn) 
Here need to expose some arguments for default setting. PADDLE_ENFORCE(node->IsFunctionBlock()); - auto* block_node = static_cast(node); + auto *block_node = static_cast(node); framework::proto::BlockDesc proto; framework::BlockDesc block_desc(nullptr, &proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + LOG(INFO) << "transformed variable size: " + << block_desc.Proto()->vars().size(); // copy ops. - for (auto* node : block_node->subgraph) { - auto* op = block_desc.AppendOp(); + for (auto *node : block_node->subgraph) { + auto *op = block_desc.AppendOp(); PADDLE_ENFORCE(!node->pb_msg().empty()); op->Proto()->ParseFromString(node->pb_msg()); } + *block_desc.Proto()->mutable_vars() = + argument_->origin_program_desc->blocks(0).vars(); + PADDLE_ENFORCE(!block_desc.Proto()->vars().empty()); CreateTrtEngineOp(node, *argument_->main_dfg, *block_desc.Proto()); - auto* main_block = desc_->mutable_blocks(framework::kRootBlockIndex); - auto* op = main_block->add_ops(); + auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); + auto *op = main_block->add_ops(); PADDLE_ENFORCE(!node->pb_msg().empty(), "failed to set desc for block"); op->ParseFromString(node->pb_msg()); } @@ -151,7 +165,7 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - explicit DFG_DebuggerPass(const Config& config) + explicit DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } @@ -160,7 +174,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { }; } // namespace -Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { +Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( 
FLAGS_inference_analysis_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index edc84b02ed20991e3e7c6c437d2b1fac169bae03..59c47365aa6c8ad5886c4515850d264f69cc4670 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -26,6 +26,10 @@ namespace paddle { namespace inference { + +DECLARE_int32(tensorrt_max_batchsize); +DECLARE_int32(tensorrt_workspace_size); + namespace analysis { class DataFlowGraphToFluidPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 162455b9c4e06b7fbb4bdede30444faf6a8a1509..65842b1e850953e77e3d4d28416609be271af9f1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -40,7 +40,7 @@ TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { no++; } // DFG is sensitive to ProgramDesc, be careful to change the existing models. 
- ASSERT_EQ(no, 82); + ASSERT_EQ(no, 83); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index e918622d74cfb11d83090555be2a768cc14e7742..496921db9eabce1b1e40c7cb13089446ca93321c 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -28,7 +28,6 @@ bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); PADDLE_ENFORCE(argument); if (!argument->main_dfg) { - LOG(INFO) << "Init DFG"; argument->main_dfg.reset(new DataFlowGraph); } desc_ = argument->origin_program_desc.get(); @@ -51,6 +50,7 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { v->SetPbMsg(var.SerializeAsString()); var2id[var.name()] = v->id(); } + for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,19 +62,31 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - // TODO(Superjomn) make sure the InputNames is the real variable name. + std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); + inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); + if (inlinks.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). 
+ auto *out_alias = graph->nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } out->inlinks.push_back(o); o->outlinks.push_back(out); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index cbca5abdd5fff1672ba5d47a8876489c54ad6947..dadb84059d21adab44159a6145b345460663cb96 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -24,12 +24,12 @@ namespace analysis { TEST_F(DFG_Tester, Init) { FluidToDataFlowGraphPass pass; pass.Initialize(&argument); - DataFlowGraph graph; - pass.Run(&graph); + pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. 
- ASSERT_EQ(graph.nodes.size(), 37UL); + ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); - LOG(INFO) << '\n' << graph.DotString(); + ASSERT_FALSE(argument.main_dfg->DotString().empty()); + EXPECT_FALSE(argument.main_dfg->inputs.empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc index 9993de22800bc0aafdcbf46618e6b479ac1eb187..faf876de6d65d20cf7a084cd97392cfc8d791a42 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc @@ -25,6 +25,9 @@ TensorRTSubGraphPass::TensorRTSubGraphPass( void TensorRTSubGraphPass::Run(DataFlowGraph *graph) { SubGraphFuse(graph, node_inside_subgraph_teller_)(); + VLOG(4) << "debug info " + << graph->HumanReadableInfo(false /*show_values*/, + true /*show_functions*/); } } // namespace analysis diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 9d63d08dedf6a1bcdacc51bb83d2ed261bca4117..e28e144fd54cec06b0228ac9c478de7c641455a0 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -82,7 +82,7 @@ inference_api_test(test_api_impl if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine SRCS api_tensorrt_subgraph_engine.cc - DEPS paddle_inference_api analysis tensorrt_engine paddle_fluid_api) + DEPS paddle_inference_api analysis tensorrt_engine paddle_inference_api paddle_fluid_api tensorrt_converter) inference_api_test(test_api_tensorrt_subgraph_engine ARGS test_word2vec) endif() diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index f6f3cb335897b02905e24c229b92f3940a37dbf8..0206ac60103759deda91be741617bde63e003de6 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -39,7 +39,7 
@@ bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { bool PaddleInferenceAnakinPredictor::Run( const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, int batch_size) { for (const auto &input : inputs) { if (input.dtype != PaddleDType::FLOAT32) { LOG(ERROR) << "Only support float type inputs. " << input.name diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 85ca83cd00756cca04d7b92437e9955d8ab297e7..def096c867ec85624f5b221782ef8b6240923c05 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -37,7 +37,8 @@ class PaddleInferenceAnakinPredictor : public PaddlePredictor { // NOTE Unlike the native engine, the buffers of anakin engine's output_data // should be allocated first. bool Run(const std::vector& inputs, - std::vector* output_data) override; + std::vector* output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 786dc8e827806a9cea9dc01788fada2fd754b930..3ae255e13fc4f3ca28a6af62a5d5944d84303fc7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -108,7 +108,8 @@ NativePaddlePredictor::~NativePaddlePredictor() { } bool NativePaddlePredictor::Run(const std::vector &inputs, - std::vector *output_data) { + std::vector *output_data, + int batch_size) { VLOG(3) << "Predictor::predict"; Timer timer; timer.tic(); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 92e693578ab657004f3c40c09b979897afea1e1f..4f28c3cd34bade4189871210e6168c6c1c610c2c 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -38,7 +38,8 @@ class NativePaddlePredictor : public PaddlePredictor { bool Init(std::shared_ptr parent_scope); bool Run(const std::vector 
&inputs, - std::vector *output_data) override; + std::vector *output_data, + int batch_size = -1) override; std::unique_ptr Clone() override; diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 0cdc88fa1eaf3935ce0da143e1e91eb84cd70dcf..c0891e9c281961fa03d278a0f5c676f92672c419 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { @@ -64,16 +65,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return false; } - // Analyze inference_program - Argument argument; - argument.origin_program_desc.reset( - new ProgramDesc(*inference_program_->Proto())); - Singleton::Global().Run(&argument); - CHECK(argument.transformed_program_desc); - VLOG(5) << "transformed program:\n" - << argument.transformed_program_desc->SerializeAsString(); - VLOG(5) << "to prepare executor"; - *inference_program_->Proto() = *argument.transformed_program_desc; + OptimizeInferenceProgram(); ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; @@ -86,6 +78,29 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { return true; } + bool Run(const std::vector& inputs, + std::vector* output_data, + int batch_size = -1) override { + PADDLE_ENFORCE_GT(batch_size, 0, + "TensorRT engine needs the argument batch_size set"); + FLAGS_tensorrt_engine_batch_size = batch_size; + return NativePaddlePredictor::Run(inputs, output_data, batch_size); + } + + void OptimizeInferenceProgram() { + // Analyze inference_program + Argument argument; + argument.origin_program_desc.reset( + new ProgramDesc(*inference_program_->Proto())); + 
Singleton::Global().Run(&argument); + CHECK(argument.transformed_program_desc); + VLOG(5) << "transformed program:\n" + << argument.transformed_program_desc->SerializeAsString(); + VLOG(5) << "to prepare executor"; + inference_program_.reset( + new framework::ProgramDesc(*argument.transformed_program_desc)); + } + private: TensorRTConfig config_; }; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index b8ba2d14a5c161d491d838888ea14b776f769f23..2f8b4f8596946988a728b5cf82de251bfda778a9 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -98,7 +98,8 @@ class PaddlePredictor { // responsible for the output tensor's buffer, either allocated or passed from // outside. virtual bool Run(const std::vector& inputs, - std::vector* output_data) = 0; + std::vector* output_data, + int batch_size = -1) = 0; // Clone a predictor that share the model weights, the Cloned predictor should // be thread-safe. 
diff --git a/paddle/fluid/inference/api/test_api.cc b/paddle/fluid/inference/api/test_api.cc index ac8a21a22be6f27311b8ae2507d04d9d1b510e76..7a579610eefda24c911edd28b5f3a178aa10ab1e 100644 --- a/paddle/fluid/inference/api/test_api.cc +++ b/paddle/fluid/inference/api/test_api.cc @@ -35,7 +35,8 @@ class DemoPredictor : public PaddlePredictor { LOG(INFO) << "I get other_config " << config.other_config; } bool Run(const std::vector &inputs, - std::vector *output_data) override { + std::vector *output_data, + int batch_size = 0) override { LOG(INFO) << "Run"; return false; } diff --git a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc index 585f6d29376c3341c21ff76361d5335512c1b1b6..62d98a796708612e7d4ff8abfd85125978ce22c7 100644 --- a/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/test_api_tensorrt_subgraph_engine.cc @@ -15,50 +15,79 @@ #include #include #include +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { DEFINE_string(dirname, "", "Directory of the inference model."); -void Main(bool use_gpu) { +void CompareTensorRTWithFluid(bool enable_tensorrt) { + FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; + //# 1. Create PaddlePredictor with a config. 
- TensorRTConfig config; - config.model_dir = FLAGS_dirname + "word2vec.inference.model"; - config.use_gpu = use_gpu; - config.fraction_of_gpu_memory = 0.15; - config.device = 0; - auto predictor = + NativeConfig config0; + config0.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config0.use_gpu = true; + config0.fraction_of_gpu_memory = 0.3; + config0.device = 0; + + TensorRTConfig config1; + config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; + config1.use_gpu = true; + config1.fraction_of_gpu_memory = 0.3; + config1.device = 0; + + auto predictor0 = + CreatePaddlePredictor(config0); + auto predictor1 = CreatePaddlePredictor(config); + PaddleEngineKind::kAutoMixedTensorRT>(config1); - for (int batch_id = 0; batch_id < 3; batch_id++) { + for (int batch_id = 0; batch_id < 1; batch_id++) { //# 2. Prepare input. - int64_t data[4] = {1, 2, 3, 4}; + std::vector data(20); + for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor{ + .name = "", + .shape = std::vector({10, 1}), + .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), + .dtype = PaddleDType::INT64}; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); //# 3. Run - std::vector outputs; - CHECK(predictor->Run(slots, &outputs)); + std::vector outputs0; + std::vector outputs1; + CHECK(predictor0->Run(slots, &outputs0)); + CHECK(predictor1->Run(slots, &outputs1, 10)); //# 4. Get output. - ASSERT_EQ(outputs.size(), 1UL); - LOG(INFO) << "output buffer size: " << outputs.front().data.length(); - const size_t num_elements = outputs.front().data.length() / sizeof(float); - // The outputs' buffers are in CPU memory. 
- for (size_t i = 0; i < std::min(5UL, num_elements); i++) { - LOG(INFO) << static_cast(outputs.front().data.data())[i]; + ASSERT_EQ(outputs0.size(), 1UL); + ASSERT_EQ(outputs1.size(), 1UL); + + const size_t num_elements = outputs0.front().data.length() / sizeof(float); + const size_t num_elements1 = outputs1.front().data.length() / sizeof(float); + EXPECT_EQ(num_elements, num_elements1); + + auto *data0 = static_cast(outputs0.front().data.data()); + auto *data1 = static_cast(outputs1.front().data.data()); + + ASSERT_GT(num_elements, 0UL); + for (size_t i = 0; i < std::min(num_elements, num_elements1); i++) { + EXPECT_NEAR(data0[i], data1[i], 1e-3); } } } -TEST(paddle_inference_api_tensorrt_subgraph_engine, main) { Main(true); } +TEST(paddle_inference_api_tensorrt_subgraph_engine, without_tensorrt) { + CompareTensorRTWithFluid(false); +} + +TEST(paddle_inference_api_tensorrt_subgraph_engine, with_tensorrt) { + CompareTensorRTWithFluid(true); +} } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 6697952051c4b1997ca6b550da17a52e64cb3454..968f7eb99ce8519edaa585fd3cb642bd80cc63cc 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -93,6 +93,10 @@ class OpConverter { framework::Scope* scope_{nullptr}; }; +} // namespace tensorrt +} // namespace inference +} // namespace paddle + #define REGISTER_TRT_OP_CONVERTER(op_type__, Converter__) \ struct trt_##op_type__##_converter : public ::paddle::framework::Registrar { \ trt_##op_type__##_converter() { \ @@ -111,7 +115,3 @@ class OpConverter { extern int TouchConverterRegister_##op_type__(); \ static int use_op_converter_trt_##op_type__ __attribute__((unused)) = \ TouchConverterRegister_##op_type__(); - -} // namespace tensorrt -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc 
b/paddle/fluid/inference/tensorrt/engine.cc index 596e0fe9da3d272ecb1c0f8dbef09a75d08a4b1a..fefec0df6d03669a294ce9643b666d7416593708 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -26,18 +26,20 @@ namespace paddle { namespace inference { namespace tensorrt { -void TensorRTEngine::Build(const DescType& paddle_model) { +void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } void TensorRTEngine::Execute(int batch_size) { - std::vector buffers; - for (auto& buf : buffers_) { + batch_size_ = batch_size; + std::vector buffers; + for (auto &buf : buffers_) { PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); PADDLE_ENFORCE_GT(buf.max_size, 0); PADDLE_ENFORCE(buf.device == DeviceType::GPU); buffers.push_back(buf.buffer); } + PADDLE_ENFORCE_NOT_NULL(stream_); infer_context_->enqueue(batch_size, buffers.data(), *stream_, nullptr); cudaStreamSynchronize(*stream_); } @@ -45,7 +47,7 @@ void TensorRTEngine::Execute(int batch_size) { TensorRTEngine::~TensorRTEngine() { cudaStreamSynchronize(*stream_); // clean buffer - for (auto& buf : buffers_) { + for (auto &buf : buffers_) { if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); buf.buffer = nullptr; @@ -70,32 +72,37 @@ void TensorRTEngine::FreezeNetwork() { // allocate GPU buffers. buffers_.resize(buffer_sizes_.size()); - for (auto& item : buffer_sizes_) { + for (auto &item : buffer_sizes_) { + // The output buffers are not set in the network building phrase, need to + // infer from the TesorRT network. 
if (item.second == 0) { auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); auto dims = infer_engine_->getBindingDimensions(slot_offset); item.second = kDataTypeSize[static_cast( infer_engine_->getBindingDataType(slot_offset))] * analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_GT(item.second, 0); } - auto& buf = buffer(item.first); + + auto &buf = buffer(item.first); + buf.max_size = item.second * max_batch_; CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second)); - VLOG(4) << "buffer malloc " << item.first << " " << item.second << " " - << buf.buffer; - buf.size = buf.max_size = item.second; + PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, buf.max_size)); + PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G + // buf.size will changed in the runtime. + buf.size = 0; buf.device = DeviceType::GPU; } } -nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, +nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, nvinfer1::DataType dtype, - const nvinfer1::Dims& dims) { + const nvinfer1::Dims &dims) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate input name %s", name); PADDLE_ENFORCE(infer_network_ != nullptr, "should initnetwork first"); - auto* input = infer_network_->addInput(name.c_str(), dtype, dims); + auto *input = infer_network_->addInput(name.c_str(), dtype, dims); PADDLE_ENFORCE(input, "infer network add input %s failed", name); buffer_sizes_[name] = kDataTypeSize[static_cast(dtype)] * analysis::AccuDims(dims.d, dims.nbDims); @@ -104,12 +111,12 @@ nvinfer1::ITensor* TensorRTEngine::DeclareInput(const std::string& name, return input; } -void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, - const std::string& name) { +void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer *layer, int offset, + const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output 
name %s", name); - auto* output = layer->getOutput(offset); + auto *output = layer->getOutput(offset); SetITensor(name, output); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); @@ -121,11 +128,11 @@ void TensorRTEngine::DeclareOutput(const nvinfer1::ILayer* layer, int offset, buffer_sizes_[name] = 0; } -void TensorRTEngine::DeclareOutput(const std::string& name) { +void TensorRTEngine::DeclareOutput(const std::string &name) { PADDLE_ENFORCE_EQ(0, buffer_sizes_.count(name), "duplicate output name %s", name); - auto* output = TensorRTEngine::GetITensor(name); + auto *output = TensorRTEngine::GetITensor(name); PADDLE_ENFORCE(output != nullptr); output->setName(name.c_str()); PADDLE_ENFORCE(!output->isNetworkInput()); @@ -135,38 +142,45 @@ void TensorRTEngine::DeclareOutput(const std::string& name) { buffer_sizes_[name] = 0; } -void* TensorRTEngine::GetOutputInGPU(const std::string& name) { +void *TensorRTEngine::GetOutputInGPU(const std::string &name) { return buffer(name).buffer; } -void TensorRTEngine::GetOutputInGPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, size_t max_size) { // determine data size auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); PADDLE_ENFORCE_GT(it->second, 0); PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, it->second, cudaMemcpyDeviceToDevice, *stream_), 0); } -void TensorRTEngine::GetOutputInCPU(const std::string& name, void* dst, +void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, size_t max_size) { + VLOG(4) << "get output in cpu"; + auto &buf = buffer(name); + + // Update needed buffer size. 
+ auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); + auto dims = infer_engine_->getBindingDimensions(slot_offset); + buf.size = kDataTypeSize[static_cast( + infer_engine_->getBindingDataType(slot_offset))] * + analysis::AccuDims(dims.d, dims.nbDims); + PADDLE_ENFORCE_LE(buf.size, buf.max_size); // determine data size - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_GE(max_size, it->second); - auto& buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, it->second, - cudaMemcpyDeviceToHost, *stream_)); + // DEBUG + memset(dst, 0, buf.size); + PADDLE_ENFORCE_EQ( + 0, cudaMemcpy(dst, buf.buffer, buf.size, cudaMemcpyDeviceToHost)); } -Buffer& TensorRTEngine::buffer(const std::string& name) { +Buffer &TensorRTEngine::buffer(const std::string &name) { PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); auto it = buffer_sizes_.find(name); PADDLE_ENFORCE(it != buffer_sizes_.end()); @@ -174,19 +188,23 @@ Buffer& TensorRTEngine::buffer(const std::string& name) { return buffers_[slot_offset]; } -void TensorRTEngine::SetInputFromCPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf = buffer(name); PADDLE_ENFORCE_NOT_NULL(buf.buffer); + PADDLE_ENFORCE_NOT_NULL(data); + PADDLE_ENFORCE_NOT_NULL(stream_); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); + buf.size = size; PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, cudaMemcpyHostToDevice, *stream_)); } -void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, +void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, size_t size) { - auto& buf = buffer(name); + auto &buf 
= buffer(name); + buf.size = size; PADDLE_ENFORCE_NOT_NULL(buf.buffer); PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); PADDLE_ENFORCE(buf.device == DeviceType::GPU); @@ -194,15 +212,15 @@ void TensorRTEngine::SetInputFromGPU(const std::string& name, const void* data, cudaMemcpyDeviceToDevice, *stream_)); } -void TensorRTEngine::SetITensor(const std::string& name, - nvinfer1::ITensor* tensor) { +void TensorRTEngine::SetITensor(const std::string &name, + nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); PADDLE_ENFORCE_EQ(0, itensor_map_.count(name), "duplicate ITensor name %s", name); itensor_map_[name] = tensor; } -nvinfer1::ITensor* TensorRTEngine::GetITensor(const std::string& name) { +nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { PADDLE_ENFORCE(itensor_map_.count(name), "no ITensor %s", name); return itensor_map_[name]; } diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index b06a9bbc6758ae9410b2fce99ef2b1a9e7ab98c0..7064d333f6db754f88c0ac6956a9527a48bf866c 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -57,7 +57,9 @@ class TensorRTEngine : public EngineBase { : max_batch_(max_batch), max_workspace_(max_workspace), stream_(stream ? stream : &default_stream_), - logger_(logger) {} + logger_(logger) { + cudaStreamCreate(&default_stream_); + } virtual ~TensorRTEngine(); @@ -121,6 +123,9 @@ class TensorRTEngine : public EngineBase { int max_batch_; // the max memory size the engine uses int max_workspace_; + + // batch size of the current data, will be updated each Executation. + int batch_size_{-1}; cudaStream_t* stream_; // If stream_ is not set from outside, hold its own stream. 
cudaStream_t default_stream_; diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index e635f0f87d577a1f1ac74687ee60f762be525418..fca3488008ed83418b5e28b8af42d8019aaaa2a4 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -103,6 +103,10 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { LOG(INFO) << "to get output"; float y_cpu[2] = {-1., -1.}; + auto dims = engine_->GetITensor("y")->getDimensions(); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 2); + ASSERT_EQ(dims.d[1], 1); engine_->GetOutputInCPU("y", &y_cpu[0], sizeof(float) * 2); ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d265150f25419509126028e36e629aee3ee6bd0f..4e2002ad24415437ae4f85eba0e90a6c689e2996 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -168,6 +168,8 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(relu);\n") elseif(${TARGET} STREQUAL "fake_dequantize") file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n") + elseif(${TARGET} STREQUAL "tensorrt_engine_op") + message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference") else() file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") endif() @@ -237,9 +239,9 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_op DEPS softmax) op_library(sequence_softmax_op DEPS softmax) if (WITH_GPU AND TENSORRT_FOUND) - op_library(tensorrt_engine_op DEPS tensorrt_engine) + op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter) nv_test(test_tensorrt_engine_op SRCS tensorrt_engine_op_test.cc - DEPS tensorrt_engine_op tensorrt_engine tensorrt_converter + DEPS tensorrt_engine_op analysis) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) diff --git 
a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc index dcd73e3c3e40f80e07b73944d1f0cc57fea010d3..5f43c5810812260c4384349bdb709716c9a182f5 100644 --- a/paddle/fluid/operators/momentum_op.cc +++ b/paddle/fluid/operators/momentum_op.cc @@ -98,7 +98,7 @@ The update equations are as follows: $$ velocity = mu * velocity + gradient \\ if (use\_nesterov): \\ - param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\ + param = param - (gradient + mu * velocity) * learning\_rate \\ else: \\ param = param - learning\_rate * velocity. \\ $$ diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu index 5eb9d9950248bb50bb823f071c7fff0ddcc47234..a3932db1f3a50305d585cd3d5e86fa1b527df78b 100644 --- a/paddle/fluid/operators/momentum_op.cu +++ b/paddle/fluid/operators/momentum_op.cu @@ -30,7 +30,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v, T g_val = g[i]; T v_new = v[i] * mu + g_val; v_out[i] = v_new; - p_out[i] = p[i] - (g_val - v_new * mu) * lr; + p_out[i] = p[i] - (g_val + v_new * mu) * lr; } } else { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h index 04a1929b84a93af6465bacfe7974a1530296946d..264726040fb566a52b8c0cdee0a1524197d2a675 100644 --- a/paddle/fluid/operators/momentum_op.h +++ b/paddle/fluid/operators/momentum_op.h @@ -46,7 +46,7 @@ class MomentumOpKernel : public framework::OpKernel { v_out = v * mu + g; if (use_nesterov) { - p_out = p - (g - v_out * mu) * lr[0]; + p_out = p - (g + v_out * mu) * lr[0]; } else { p_out = p - lr[0] * v_out; } diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index 647cfc0a0af2be85e2868c6f68cab962c6631a8d..43672d6db92a981f0fbe6e8f7079dafc6ae4052e 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -24,6 +24,9 @@ 
#include "paddle/fluid/operators/tensorrt_engine_op.h" namespace paddle { + +DEFINE_int32(tensorrt_engine_batch_size, 1, "the batch_size of TensorRT"); + namespace operators { using inference::Singleton; @@ -52,7 +55,6 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { "TensorRT' tensor input requires at least 2 dimensions"); PADDLE_ENFORCE_LE(shape.size(), 4UL, "TensorRT' tensor input requires at most 4 dimensions"); - switch (shape.size()) { case 2: return nvinfer1::Dims2(shape[0], shape[1]); @@ -90,27 +92,36 @@ void TensorRTEngineKernel::Prepare( engine->InitNetwork(); framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); + VLOG(4) << "parsed var size " << block.AllVars().size(); // Add inputs VLOG(4) << "declare inputs"; for (auto &input : context.Inputs("Xs")) { VLOG(4) << "declare input " << input; auto *var = block.FindVar(input); + // TensorRT engine need to create parameters. The parameter's description + // should be set in + PADDLE_ENFORCE(var, "no variable called %s", input); PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, "TensorRT engine only takes LoDTensor as input"); auto shape = var->GetShape(); + // For the special batch_size placeholder -1, drop it and pass the real + // shape of data. + // TODO(Superjomn) fix this with batch broadcast, or it can't handle + // variational batch size. 
+ if (shape[0] == -1) { + shape[0] = FLAGS_tensorrt_engine_batch_size; + } engine->DeclareInput( input, FluidDataType2TRT( var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(var->GetShape())); + Vec2TRT_Dims(shape)); } inference::Singleton::Global().ConvertBlock( block_desc, parameters, context.scope(), engine); // Add outputs - VLOG(4) << "declare outputs"; for (auto &output : context.Outputs("Ys")) { - VLOG(4) << "declare output " << output; engine->DeclareOutput(output); } @@ -151,4 +162,7 @@ REGISTER_OP_CPU_KERNEL( ops::TensorRTEngineKernel, ops::TensorRTEngineKernel); +// A trick to compile with the needed TensorRT op converter. +USE_TRT_CONVERTER(mul) + #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index 1602a913aeebe43fabe2f9c9036edd18ac4c70fd..a332d70030ffa6a033f6b2b33487a4fd279b7016 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -24,6 +24,9 @@ #include "paddle/fluid/inference/tensorrt/engine.h" namespace paddle { + +DECLARE_int32(tensorrt_engine_batch_size); + namespace operators { using inference::Singleton; @@ -53,7 +56,6 @@ template class TensorRTEngineKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - VLOG(4) << "TensorRTEngineKernel executing"; auto engine_name = context.Attr("engine_uniq_key"); if (!Singleton::Global().HasEngine(engine_name)) { Prepare(context); @@ -61,11 +63,8 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* engine = Singleton::Global().Get(engine_name); auto input_names = context.op().Inputs("Xs"); PADDLE_ENFORCE(!input_names.empty(), "should pass more than one inputs"); - // Try to determine a batch_size - auto& tensor0 = inference::analysis::GetFromScope( - context.scope(), input_names.front()); - int batch_size = tensor0.dims()[0]; - PADDLE_ENFORCE_LE(batch_size, 
context.Attr("max_batch")); + PADDLE_ENFORCE_LE(FLAGS_tensorrt_engine_batch_size, + context.Attr("max_batch")); // Convert input tensor from fluid to engine. for (const auto& x : context.Inputs("Xs")) { @@ -81,8 +80,8 @@ class TensorRTEngineKernel : public framework::OpKernel { } } // Execute the engine. - PADDLE_ENFORCE_GT(batch_size, 0); - engine->Execute(batch_size); + PADDLE_ENFORCE_GT(FLAGS_tensorrt_engine_batch_size, 0); + engine->Execute(FLAGS_tensorrt_engine_batch_size); // Convert output tensor from engine to fluid for (const auto& y : context.Outputs("Ys")) { // convert output and copy to fluid. @@ -94,18 +93,21 @@ class TensorRTEngineKernel : public framework::OpKernel { auto* fluid_v = context.scope().FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto* fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); auto size = inference::analysis::AccuDims(dims.d, dims.nbDims); - if (platform::is_cpu_place(fluid_t->place())) { - // TODO(Superjomn) change this float to dtype size. - engine->GetOutputInCPU( - y, fluid_t->mutable_data(platform::CPUPlace()), - size * sizeof(float)); - } else { - engine->GetOutputInGPU( - y, fluid_t->mutable_data(platform::CUDAPlace()), - size * sizeof(float)); - } + fluid_t->Resize(framework::make_ddim(ddim)); + + // TODO(Superjomn) find some way to determine which device to output the + // tensor. + // if (platform::is_cpu_place(fluid_t->place())) { + // TODO(Superjomn) change this float to dtype size. 
+ engine->GetOutputInCPU(y, + fluid_t->mutable_data(platform::CPUPlace()), + size * sizeof(float)); + //} else { + // engine->GetOutputInGPU( + // y, fluid_t->mutable_data(platform::CUDAPlace()), + // size * sizeof(float)); + //} } cudaStreamSynchronize(*engine->stream()); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 65e7b10625583327c2919efd45ef1fe01fa9eb3c..9e58a39eb0939fa15e9c19e1e6fc89a6f99d9a0c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -333,8 +333,7 @@ function assert_api_not_changed() { python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate - # Use git diff --name-only HEAD^ may not get file changes for update commits in one PR - API_CHANGE=`echo $CHANGED_FILES | grep "paddle/fluid/API.spec" || true` + API_CHANGE=`git diff --name-only upstream/develop | grep "paddle/fluid/API.spec" || true` echo "checking API.spec change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then # TODO: curl -H 'Authorization: token ${TOKEN}' diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 56124663929d1e33b7144ab57ae3b3c55e1652b3..ab40d0c217f565493b30d9a4cb3a600863122bc7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -166,7 +166,8 @@ def fc(input, param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable parameters/weights of this layer. bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias - of this layer. If it is set to None, no bias will be added to the output units. + of this layer. If it is set to False, no bias will be added to the output units. + If it is set to None, the bias is initialized zero. Default: None. act (str, default None): Activation to be applied to the output of this layer. is_test(bool): A flag indicating whether execution is in test phase. 
use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 7fc8e106fb43666be9c1ea245994dc1c7ac85d7d..3fe99f55011ab7f745c3ad98ec44dfe277a13e05 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -324,7 +324,7 @@ class MomentumOptimizer(Optimizer): & if (use\_nesterov): - &\quad param = param - gradient * learning\_rate + mu * velocity * learning\_rate + &\quad param = param - (gradient + mu * velocity) * learning\_rate & else: diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 8c564abf986d351b31e993b62bfe1f17c52a4e10..322d76515e76c3d322ac7c4f989bbc95875cb654 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -48,6 +48,7 @@ list(REMOVE_ITEM TEST_OPS test_warpctc_op) list(REMOVE_ITEM TEST_OPS test_dist_train) list(REMOVE_ITEM TEST_OPS test_parallel_executor_crf) list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_feed) +list(REMOVE_ITEM TEST_OPS test_dist_se_resnext) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) @@ -60,3 +61,4 @@ if(WITH_DISTRIBUTE) endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) +py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext SERIAL) diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..72bc1729b0f63b23ad7ecb5ad703b984a4c614ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -0,0 +1,350 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math + +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import sys +import signal + +# Fix seed for test +fluid.default_startup_program().random_seed = 1 +fluid.default_main_program().random_seed = 1 + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class SE_ResNeXt(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + if layers == 50: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 6, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 101: + cardinality = 32 + reduction_ratio = 16 + depth = [3, 4, 23, 3] + num_filters = [128, 256, 
512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=7, + stride=2, + act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + elif layers == 152: + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + conv = self.conv_bn_layer( + input=input, + num_filters=64, + filter_size=3, + stride=2, + act='relu') + conv = self.conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = self.conv_bn_layer( + input=conv, + num_filters=128, + filter_size=3, + stride=1, + act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, \ + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) + stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) + out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + return out + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + filter_size = 1 + return self.conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, cardinality, + reduction_ratio): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = self.squeeze_excitation( + 
input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = self.shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def squeeze_excitation(self, input, num_channels, reduction_ratio): + pool = fluid.layers.pool2d( + input=input, pool_size=0, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + stdv = 1.0 / math.sqrt(squeeze.shape[1] * 1.0) + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def get_model(batch_size): + # Input data + image = fluid.layers.fill_constant( + shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) + label = fluid.layers.fill_constant( + shape=[batch_size, 1], dtype='int64', value=0.0) + + # Train program + model = SE_ResNeXt(layers=50) + out = model.net(input=image, class_dim=102) + cost = fluid.layers.cross_entropy(input=out, label=label) + + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + + # Evaluator + test_program = fluid.default_main_program().clone(for_test=True) + + # Optimization + total_images = 6149 # flowers + epochs = [30, 60, 90] + step = int(total_images / batch_size + 1) + + bd = [step * e for e in epochs] + base_lr = 0.1 + lr = [] + lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] + + optimizer = 
fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=0.9, + regularization=fluid.regularizer.L2Decay(1e-4)) + optimizer.minimize(avg_cost) + + # Reader + train_reader = paddle.batch( + paddle.dataset.flowers.train(), batch_size=batch_size) + test_reader = paddle.batch( + paddle.dataset.flowers.test(), batch_size=batch_size) + + return test_program, avg_cost, train_reader, test_reader, acc_top1, out + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + pservers=pserver_endpoints, + trainers=trainers) + return t + + +class DistSeResneXt2x2: + def run_pserver(self, pserver_endpoints, trainers, current_endpoint, + trainer_id): + get_model(batch_size=2) + t = get_transpiler(trainer_id, + fluid.default_main_program(), pserver_endpoints, + trainers) + pserver_prog = t.get_pserver_program(current_endpoint) + startup_prog = t.get_startup_program(current_endpoint, pserver_prog) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup_prog) + exe.run(pserver_prog) + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + print("waiting ps ready: ", pid) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( + batch_size=20) + if is_dist: + t = get_transpiler(trainer_id, + fluid.default_main_program(), endpoints, + trainers) + trainer_prog = t.get_trainer_program() + else: + trainer_prog = fluid.default_main_program() + + startup_exe = fluid.Executor(place) + startup_exe.run(fluid.default_startup_program()) + + strategy = fluid.ExecutionStrategy() + strategy.num_threads = 1 + strategy.allow_op_delay = False + exe = fluid.ParallelExecutor( + True, + loss_name=avg_cost.name, + exec_strategy=strategy, + num_trainers=trainers, + trainer_id=trainer_id) + + feed_var_list = [ + var for var in trainer_prog.global_block().vars.itervalues() + if var.is_data + ] + + feeder = fluid.DataFeeder(feed_var_list, place) + reader_generator = train_reader() + first_loss, = exe.run(fetch_list=[avg_cost.name]) + print(first_loss) + for i in xrange(5): + loss, = exe.run(fetch_list=[avg_cost.name]) + last_loss, = exe.run(fetch_list=[avg_cost.name]) + print(last_loss) + + +def main(role="pserver", + endpoints="127.0.0.1:9123", + trainer_id=0, + current_endpoint="127.0.0.1:9123", + trainers=1, + is_dist=True): + model = DistSeResneXt2x2() + if role == "pserver": + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + else: + p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + + +if __name__ == "__main__": + if len(sys.argv) != 7: + print( + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + ) + role = sys.argv[1] + endpoints = sys.argv[2] + trainer_id = int(sys.argv[3]) + current_endpoint = sys.argv[4] + trainers = int(sys.argv[5]) + is_dist = True if sys.argv[6] == 
"TRUE" else False + main( + role=role, + endpoints=endpoints, + trainer_id=trainer_id, + current_endpoint=current_endpoint, + trainers=trainers, + is_dist=is_dist) diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..e3e7036f08cb88087ae45fe7d7c7565c102dab8a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -0,0 +1,122 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import argparse +import time +import math + +import unittest +import os +import signal +import subprocess + + +class TestDistSeResneXt2x2(unittest.TestCase): + def setUp(self): + self._trainers = 2 + self._pservers = 2 + self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" + self._python_interp = "python" + + def start_pserver(self): + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + ps0_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + ps1_cmd = "%s dist_se_resnext.py pserver %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + ps0_proc = subprocess.Popen( + ps0_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + ps1_proc = subprocess.Popen( + ps1_cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + return ps0_proc, ps1_proc + + def _wait_ps_ready(self, pid): + retry_times = 20 + while True: + assert retry_times >= 0, "wait ps ready failed" + time.sleep(3) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. 
+ os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + retry_times -= 1 + + def non_test_with_place(self): + # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN + required_envs = { + "PATH": os.getenv("PATH"), + "PYTHONPATH": os.getenv("PYTHONPATH"), + "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH"), + "FLAGS_fraction_of_gpu_memory_to_use": "0.15" + } + # Run local to get a base line + env_local = {"CUDA_VISIBLE_DEVICES": "0"} + env_local.update(required_envs) + local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FALSE" % \ + (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) + local_proc = subprocess.Popen( + local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local) + local_proc.wait() + local_ret = local_proc.stdout.read() + + # Run dist train to compare with local results + ps0, ps1 = self.start_pserver() + self._wait_ps_ready(ps0.pid) + self._wait_ps_ready(ps1.pid) + + ps0_ep, ps1_ep = self._ps_endpoints.split(",") + tr0_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps0_ep, self._trainers) + tr1_cmd = "%s dist_se_resnext.py trainer %s 1 %s %d TRUE" % \ + (self._python_interp, self._ps_endpoints, ps1_ep, self._trainers) + + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + env0.update(required_envs) + env1.update(required_envs) + FNULL = open(os.devnull, 'w') + + tr0_proc = subprocess.Popen( + tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1) + + tr0_proc.wait() + tr1_proc.wait() + loss_data0 = tr0_proc.stdout.read() + lines = loss_data0.split("\n") + dist_first_loss = eval(lines[0].replace(" ", ","))[0] + dist_last_loss = eval(lines[1].replace(" ", ","))[0] + + local_lines = local_ret.split("\n") + local_first_loss = eval(local_lines[0])[0] + local_last_loss = eval(local_lines[1])[0] + + self.assertAlmostEqual(local_first_loss,
dist_first_loss) + self.assertAlmostEqual(local_last_loss, dist_last_loss) + + # check tr0_out + # FIXME: ensure the server process is killed + # replace with ps0.terminate() + os.kill(ps0.pid, signal.SIGKILL) + os.kill(ps1.pid, signal.SIGKILL) + FNULL.close() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index aaea9c1809213c5707e8540eebbdd6f269836fdc..c75d3bd276a5b494090c1aa1fea0bb4f2c067173 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -39,7 +39,7 @@ class TestMomentumOp1(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out @@ -75,7 +75,7 @@ class TestMomentumOp2(OpTest): velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate + \ + param_out = param - grad * learning_rate - \ velocity_out * mu * learning_rate else: param_out = param - learning_rate * velocity_out