diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 8ac157c4d79f1f5f2a655152f46b4a4d3f2c6962..e0556a0babc74ba6efa0a190d4f7b77416bef3bf 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -102,7 +102,6 @@ set(COMMON_FLAGS -fno-omit-frame-pointer -Wall -Wextra - -Werror -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter @@ -115,6 +114,11 @@ set(COMMON_FLAGS -Wno-error=terminate # Warning in PADDLE_ENFORCE ) +# https://github.com/PaddlePaddle/Paddle/issues/12773 +if (NOT WIN32) +list(APPEND COMMON_FLAGS -Werror) +endif() + set(GPU_COMMON_FLAGS -fPIC -fno-omit-frame-pointer diff --git a/doc/fluid/design/others/graph_survey.md b/doc/fluid/design/others/graph_survey.md index 6c6db08f463ae0a2b94fc4546f123a1d7c151870..97f395133b48a1d0ed5136f0ebc8720b8ca87ded 100644 --- a/doc/fluid/design/others/graph_survey.md +++ b/doc/fluid/design/others/graph_survey.md @@ -28,7 +28,7 @@ def get_symbol(num_classes=10, **kwargs): -Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is a op field in NodeAttr class, when a Symbol represents Variable(often input data), the op field is null. +Varible here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own AnyAttr. There is a op field in AnyAttr class, when a Symbol represents Variable(often input data), the op field is null. Symbol contains a data member, std::vector outputs, and NodeEntry cantains a poniter to Node. We can follow the Node pointer to get all the Graph. diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e963902a50200b785284e8f233fcca1abf459140..9250cde1b2bc8fa1e14c0ba1ea9b509c496fc506 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -78,7 +78,7 @@ paddle.fluid.io.load_vars ArgSpec(args=['executor', 'dirname', 'main_program', ' paddle.fluid.io.load_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.load_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_inference_model ArgSpec(args=['dirname', 'feeded_var_names', 'target_vars', 'executor', 'main_program', 'model_filename', 'params_filename', 'export_for_deployment'], varargs=None, keywords=None, defaults=(None, None, None, True)) -paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename'], varargs=None, keywords=None, defaults=(None, None)) +paddle.fluid.io.load_inference_model ArgSpec(args=['dirname', 'executor', 'model_filename', 'params_filename', 'pserver_endpoints'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.io.get_inference_program ArgSpec(args=['target_vars', 'main_program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.initializer.ConstantInitializer.__init__ ArgSpec(args=['self', 'value', 'force_cpu'], varargs=None, keywords=None, defaults=(0.0, False)) paddle.fluid.initializer.UniformInitializer.__init__ ArgSpec(args=['self', 'low', 'high', 'seed'], varargs=None, keywords=None, defaults=(-1.0, 1.0, 0)) @@ -153,6 +153,7 @@ paddle.fluid.layers.image_resize ArgSpec(args=['input', 'out_shape', 'scale', 'n paddle.fluid.layers.image_resize_short ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)) paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.scatter ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -250,7 +251,6 @@ paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwarg paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.scatter ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 923a7083d4f30b646bbab03d79992b275aa2b403..da0955a9a000e0d0bff3fe9d0bc3bd25171be3d2 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -5,8 +5,12 @@ cc_library(pass SRCS pass.cc DEPS graph node graph_helper) cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) cc_library(graph_traits SRCS graph_traits.cc DEPS graph) cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits) +cc_library(fc_fuse_pass SRCS fc_fuse_pass.cc DEPS graph graph_pattern_detecter) +cc_library(infer_clean_graph_pass SRCS infer_clean_graph_pass.cc DEPS graph pass) + cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter) +cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass graph_pattern_detecter graph pass graph_traits framework_proto) diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4327742eac843f27385c165216ce48ceb97ea71 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +bool VarOutLinksToOp(Node* node, const std::string& op_type) { + for (auto* out : node->outputs) { + if (out->IsOp() && out->Op()->Type() == op_type) { + return true; + } + } + return false; +} + +void BuildFCPattern(PDPattern* pattern) { + // make sure the selected MUL op has one input argument is a parameter. + auto* mul_parameter_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->outputs.size() == 1UL && + node->outputs.front()->Op()->Type() == "mul" && node->Var() && + node->Var()->Persistable(); // check is a parameter + }, + "mul_weight" /*name*/); + + auto* mul_tmp_input_var = pattern->NewNode( + [](Node* node) { + bool result = + node->IsVar() && node->outputs.size() >= 1UL && node->Var() && + !node->Var()->Persistable(); // this input is not an parameter. + if (!result) return false; + // check whether one output is MUL op. + for (auto* op : node->outputs) { + if (op->IsOp() && op->Op()->Type() == "mul") return true; + } + return false; + }, + "mul_tmp_var" /*name*/); + + // select a MUL op + auto* mul_op = pattern->NewNode( + [](Node* node) { + return node->IsOp() && // start from an Op + node->Op()->Type() == "mul"; // type is mul + // the output should be consumed only by one element_add, that check + // leaves in a Var PDNode. + }, + "mul" /*name*/); + + // make sure the MUL op's output has only one consumer and links to an + // ELEMENTWISE_ADD op. + auto* mul_out_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && // starts from a Var + node->outputs.size() == 1UL && // only has one consumer + node->outputs.front()->IsOp() && // check basic logic + node->Var() && // not a ControlDepVar + node->outputs.front()->Op()->Type() == + "elementwise_add"; // a very strong validation + }, + "mul_out"); + // this check is not essential, just to make the corresponding variable Node + // retrival easier. + auto* elementwise_add_tmp_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->outputs.size() >= 1UL && node->Var() && + VarOutLinksToOp(node, "elementwise_add"); + }, + "elementwise_add_tmpvar"); + + // select an ELEMENTWISE_ADD op + auto* elementwise_add_op = pattern->NewNode( + [](Node* node) { + return node->IsOp() && node->Op()->Type() == "elementwise_add"; + }, + "elementwise_add" /*name*/); + + // get the ELEMENTWISE_ADD op's output + auto* elementwise_add_out_var = pattern->NewNode( + [](Node* node) { + return node->IsVar() && node->inputs.size() == 1UL && node->Var() && + node->inputs.front()->Op()->Type() == "elementwise_add"; + }, + "elementwise_add_out"); + + pattern->AddEdge(mul_parameter_var, mul_op); + pattern->AddEdge(mul_tmp_input_var, mul_op); + pattern->AddEdge(mul_op, mul_out_var); + pattern->AddEdge(mul_out_var, elementwise_add_op); + pattern->AddEdge(elementwise_add_tmp_var, elementwise_add_op); + pattern->AddEdge(elementwise_add_op, elementwise_add_out_var); +} + +// Replace the node `from` in the links to `to` +bool LinksReplace(std::vector* links, Node* from, Node* to) { + for (auto*& n : *links) { + if (n == from) { + n = to; + return true; + } + } + return false; +} + +std::unique_ptr FCFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + std::unordered_set nodes2delete; + + GraphPatternDetecter gpd; + BuildFCPattern(gpd.mutable_pattern()); + +#define GET_NODE(id) \ + PADDLE_ENFORCE(subgraph.count(gpd.pattern().RetriveNode(#id)), \ + "pattern has no Node called %s", #id); \ + auto* id = subgraph.at(gpd.pattern().RetriveNode(#id)); \ + PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); + + auto handler = [&](const GraphPatternDetecter::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle FC fuse"; + // Currently, there is no FC op available, so I will just simulate the + // scenerio. + // FC's fusion is simple, just op fuse, no need to process the + // parameters. + GET_NODE(mul_tmp_var); // x + GET_NODE(mul_weight); // Y + GET_NODE(elementwise_add_tmpvar); // bias + GET_NODE(elementwise_add_out); // Out + GET_NODE(mul); // MUL op + GET_NODE(elementwise_add); // ELEMENT_ADD op + GET_NODE(mul_out); // tmp +#undef GET_NODE + + // Create an FC Node. + OpDesc desc; + std::string fc_x_in = mul_tmp_var->Name(); + std::string fc_Y_in = mul_weight->Name(); + std::string fc_bias_in = elementwise_add_tmpvar->Name(); + std::string fc_out = elementwise_add_out->Name(); + desc.SetInput("Input", std::vector({fc_x_in})); + desc.SetInput("W", std::vector({fc_Y_in})); + desc.SetInput("Bias", std::vector({fc_bias_in})); + desc.SetOutput("Out", std::vector({fc_out})); + desc.SetType("fc"); + auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. + fc_node->inputs = + std::vector({mul_tmp_var, mul_weight, elementwise_add_tmpvar}); + fc_node->outputs.push_back(elementwise_add_out); + + // Update link relatons + PADDLE_ENFORCE(LinksReplace(&mul_tmp_var->outputs, mul, fc_node)); + PADDLE_ENFORCE(LinksReplace(&mul_weight->outputs, mul, fc_node)); + PADDLE_ENFORCE(LinksReplace(&elementwise_add_tmpvar->outputs, + elementwise_add, fc_node)); + PADDLE_ENFORCE( + LinksReplace(&elementwise_add_out->inputs, elementwise_add, fc_node)); + + // Drop old nodes + graph->RemoveNode(mul); + graph->RemoveNode(elementwise_add); + graph->RemoveNode(mul_out); // tmp variable + }; + + gpd(graph.get(), handler); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fc_fuse_pass, paddle::framework::ir::FCFusePass); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..eb43dd4486cda578804fb9f6438c67e9e4a03091 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detecter.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the MUL and ELEMENTWISE_ADD to a FCOp. + */ +class FCFusePass : public Pass { + public: + virtual ~FCFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..87ba417b1a43475f48380009f8e5cd84699b8e40 --- /dev/null +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fc_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("Xs", inputs); + op->SetOutput("Ys", outputs); +} + +// a->OP0->b +// a->OP1->c +// (b, c)->mul->d +// (d, e)->elementwise_add->f +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + return prog; +} + +TEST(FCFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("fc_fuse_pass"); + + int pre_nodes = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int after_nodes = graph->Nodes().size(); + + // Remove 3 Nodes: MUL,ELEMENTWISE_ADD, mul_out + // Add 1 Node: FC + EXPECT_EQ(pre_nodes - 2, after_nodes); + + // Assert fc op in newly generated graph + int fc_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "fc") { + ++fc_count; + } + } + EXPECT_EQ(fc_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fc_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 5736a5c4e232698085936303d1f23760649f8245..25e33861c06c9fcd2625e3a4036a04508acbd2ca 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -98,11 +98,13 @@ class Graph { // Create a normal variable with non-null VarDesc. ir::Node *CreateVarNode(VarDesc *var_desc) { + PADDLE_ENFORCE(var_desc); return AddNode(new ir::Node(var_desc)); } // Create a normal runnable operator with OpDesc. ir::Node *CreateOpNode(OpDesc *op_desc) { + PADDLE_ENFORCE(op_desc); return AddNode(new ir::Node(op_desc)); } @@ -134,6 +136,14 @@ class Graph { return ret; } + void RemoveNode(ir::Node *node) { + PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); + node_set_.erase(node); + nodes_.erase(node); + } + + const ProgramDesc &program() const { return program_; } + private: // This method takes ownership of `node`. ir::Node *AddNode(ir::Node *node) { @@ -143,12 +153,6 @@ class Graph { return node; } - void RemoveNode(ir::Node *node) { - PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); - node_set_.erase(node); - nodes_.erase(node); - } - // NOTE: program_ shouldn't be exposed to user. const ProgramDesc &program_; std::map attrs_; diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.cc b/paddle/fluid/framework/ir/graph_pattern_detecter.cc index dcc4382792d6a21dc10c37d2173d8cf1989219a5..e197861251fe5c9f98eaaba2a10b4af371dcbcba 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc @@ -25,12 +25,30 @@ namespace paddle { namespace framework { namespace ir { +size_t PDPattern::id_ = 0UL; + PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) { + if (!name.empty()) { + PADDLE_ENFORCE_EQ(node_map_.count(name), 0, + "PDNode's name should be unique, get duplicate [%s]", + name); + } + nodes_.emplace_back(new PDNode(std::move(teller), name)); auto* cur = nodes_.back().get(); + node_map_[name] = cur; return cur; } +PDNode* PDPattern::RetriveNode(const std::string& id) const { + auto it = node_map_.find(id); + if (it == node_map_.end()) { + return nullptr; + } + + return it->second; +} + void PDPattern::AddEdge(PDNode* a, PDNode* b) { PADDLE_ENFORCE(a); PADDLE_ENFORCE(b); @@ -51,15 +69,18 @@ void GraphPatternDetecter::operator()(Graph* graph, } bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) { + VLOG(4) << "mark pdnodes in graph"; if (graph.Nodes().empty()) return false; for (auto& node : GraphTraits::DFS(graph)) { for (const auto& pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { + VLOG(4) << "pdnode " << pdnode->name() << " marked"; pdnodes2nodes_[pdnode.get()].insert(&node); } } } + VLOG(3) << pdnodes2nodes_.size() << " nodes marked"; return !pdnodes2nodes_.empty(); } @@ -67,10 +88,20 @@ struct HitGroup { std::unordered_map roles; bool Match(Node* node, PDNode* pat) { + if (nodes_.count(node)) { + if (!roles.count(pat)) return false; + return roles[pat] == node; + } return !roles.count(pat) || roles.at(pat) == node; } - void Register(Node* node, PDNode* pat) { roles[pat] = node; } + void Register(Node* node, PDNode* pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; }; // Tell whether Node a links to b. @@ -104,6 +135,7 @@ GraphPatternDetecter::DetectPatterns() { // Extend a PDNode to subgraphs by deducing the connection relations defined // in edges of PDNodes. for (const auto& edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); // Each role has two PDNodes, which indicates two roles. // Detect two Nodes that can match these two roles and they are connected. auto& pre_groups = bi_records[step % 2]; @@ -127,6 +159,7 @@ GraphPatternDetecter::DetectPatterns() { } } } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); } for (auto& group : bi_records[step % 2]) { diff --git a/paddle/fluid/framework/ir/graph_pattern_detecter.h b/paddle/fluid/framework/ir/graph_pattern_detecter.h index 1778bf00000f60e5cf8b2a585bf7e5dae0a582eb..68c39902b5a79bf25ca7f08529a958274ac64e33 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detecter.h +++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h @@ -96,7 +96,8 @@ class PDPattern { void AddEdge(PDNode* a, PDNode* b); - PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = ""); + PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = NewID()); + PDNode* RetriveNode(const std::string& id) const; const std::vector>& nodes() const { return nodes_; } const std::vector& edges() const { return edges_; } @@ -107,8 +108,12 @@ class PDPattern { FRIEND_TEST(PDPattern, NewNode); #endif + static std::string NewID() { return "pdnode-" + std::to_string(id_++); } + std::vector> nodes_; std::vector edges_; + std::unordered_map node_map_; + static size_t id_; }; /* diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index b1b8d1c586c98a327a8e5b4890ced00022155e6b..cadda49c399a6d65079cacedfea61f4fd580a69a 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -200,9 +200,11 @@ TEST(GraphTest, WriteAfterWrite) { ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); control_dep2 = n->inputs[1]; ASSERT_EQ(n->inputs.size(), 2); - ASSERT_EQ(control_dep1, control_dep2); } } + ASSERT_NE(control_dep1, nullptr); + ASSERT_NE(control_dep2, nullptr); + ASSERT_EQ(control_dep1, control_dep2); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 8cb812d1388bf74d173a4dc7a99561e730f8e95a..e7ff0c1dac134334e3baad88886862ebff0fe367 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -25,6 +25,7 @@ static const char kGraphVizPath[] = "graph_viz_path"; std::unique_ptr GraphVizPass::ApplyImpl( std::unique_ptr graph) const { const std::string graph_viz_path = Get(kGraphVizPath); + VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); PADDLE_ENFORCE(fout->good()); std::ostream& sout = *fout; diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f885567da1965b997b2063e06c839af95b43e1e1 --- /dev/null +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -0,0 +1,69 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class InferCleanGraphPass : public Pass { + public: + virtual ~InferCleanGraphPass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + + auto is_valid_node = [](Node* x) { + return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); + }; + + std::unordered_set invalid_nodes; + for (auto* node : graph->Nodes()) { + if (is_valid_node(node)) { + invalid_nodes.insert(node); + } + } + + // remove nodes from the graph. + for (auto* node : invalid_nodes) { + graph->RemoveNode(node); + } + + // clean edges. + for (auto* node : graph->Nodes()) { + CleanEdges(&node->inputs, invalid_nodes); + CleanEdges(&node->outputs, invalid_nodes); + } + + return graph; + } + + void CleanEdges(std::vector* nodes, + const std::unordered_set& to_remove) const { + auto it = std::remove_if(nodes->begin(), nodes->end(), + [&](Node* x) { return to_remove.count(x); }); + nodes->erase(it, nodes->end()); + } +}; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(infer_clean_graph_pass, + paddle::framework::ir::InferCleanGraphPass); diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 9c0765ab8ce16733ac021aefc8c7b2bb779319f3..063c70fb7b9c0f9b90d872a70f362459ef149391 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -34,14 +34,15 @@ class Node { explicit Node(VarDesc* var_desc) : name_(var_desc->Name()), - var_desc_(var_desc), + var_desc_(new VarDesc(*var_desc)), op_desc_(nullptr), type_(Type::kVariable) {} explicit Node(OpDesc* op_desc) : name_(op_desc->Type()), var_desc_(nullptr), - op_desc_(op_desc), + op_desc_(new OpDesc(*op_desc)), // TODO(panyx0718) the pointer in the + // original OpDesc might go out. type_(Type::kOperation) {} Type NodeType() const { return type_; } @@ -50,12 +51,12 @@ class Node { VarDesc* Var() { PADDLE_ENFORCE(type_ == Type::kVariable); - return var_desc_; + return var_desc_.get(); } OpDesc* Op() { - PADDLE_ENFORCE(type_ == Type::kOperation); - return op_desc_; + PADDLE_ENFORCE(IsOp()); + return op_desc_.get(); } bool IsOp() const { return type_ == Type::kOperation; } @@ -66,8 +67,8 @@ class Node { protected: const std::string name_; - VarDesc* var_desc_; - OpDesc* op_desc_; + std::unique_ptr var_desc_; + std::unique_ptr op_desc_; Type type_; private: diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index b972efe5b0a6942bf0710755bbabbaf863477931..f1271ddb75f3170756941c72f975a3030dd2368c 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,14 +1,18 @@ +cc_library(ir_pass_manager SRCS ir_pass_manager.cc DEPS graph pass) cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph_traits.cc subgraph_splitter.cc + analyzer.cc + helper.cc + # passes fluid_to_data_flow_graph_pass.cc data_flow_graph_to_fluid_pass.cc dfg_graphviz_draw_pass.cc tensorrt_subgraph_pass.cc tensorrt_subgraph_node_mark_pass.cc - analyzer.cc - helper.cc - model_store_pass.cc - DEPS framework_proto proto_desc) -cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest) + fluid_to_ir_pass.cc + model_store_pass.cc + DEPS framework_proto proto_desc ir_pass_manager graph pass) + +cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) @@ -27,19 +31,30 @@ function (inference_analysis_test TARGET) endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" - DEPS analysis + DEPS analysis graph fc_fuse_pass graph_viz_pass infer_clean_graph_pass graph_pattern_detecter pass ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) +cc_test(test_analyzer SRCS analyzer_tester.cc DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis + # ir + fc_fuse_pass + graph_viz_pass + infer_clean_graph_pass + graph_pattern_detecter + pass + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model) +#set_tests_properties(test_analyzer PROPERTIES DEPENDS test_word2vec) +#inference_api_test(test_analyzer SRC analyzer_tester.cc ARGS test_word2vec) + inference_analysis_test(test_data_flow_graph SRCS data_flow_graph_tester.cc) inference_analysis_test(test_data_flow_graph_to_fluid_pass SRCS data_flow_graph_to_fluid_pass_tester.cc) +inference_analysis_test(test_fluid_to_ir_pass SRCS fluid_to_ir_pass_tester.cc) inference_analysis_test(test_fluid_to_data_flow_graph_pass SRCS fluid_to_data_flow_graph_pass_tester.cc) inference_analysis_test(test_subgraph_splitter SRCS subgraph_splitter_tester.cc) inference_analysis_test(test_dfg_graphviz_draw_pass SRCS dfg_graphviz_draw_pass_tester.cc) inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_tester.cc) inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) -inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 912615c945aa0b180498459bed2c1dc9e20f3925..fc8b1f68647cb998b391df6e07aee66135d3309b 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" #include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" @@ -24,14 +25,15 @@ namespace paddle { -DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, true, +DEFINE_bool(IA_enable_tensorrt_subgraph_engine, false, "Enable subgraph to TensorRT engine for acceleration"); -DEFINE_string(inference_analysis_graphviz_log_root, "./", +DEFINE_bool(IA_enable_ir, false, "Turn on IR support"); + +DEFINE_string(IA_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); -DEFINE_string(inference_analysis_output_storage_path, "", - "optimized model output path"); +DEFINE_string(IA_output_storage_path, "", "optimized model output path"); namespace inference { namespace analysis { @@ -40,8 +42,34 @@ class DfgPassManagerImpl final : public DfgPassManager { public: DfgPassManagerImpl() { // TODO(Superjomn) set the key with pass reprs. - AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); - if (FLAGS_inference_analysis_enable_tensorrt_subgraph_engine) { + LOG(INFO) + << "-----------------------------------------------------------------"; + if (FLAGS_IA_enable_ir) { + AddPass("fluid-to-ir-pass", new FluidToIrPass); + } else { + AddPass("fluid-to-data-flow-graph", new FluidToDataFlowGraphPass); + } + TryAddTensorRtPass(); + AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_IA_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } + LOG(INFO) + << "-----------------------------------------------------------------"; + } + + std::string repr() const override { return "dfg-pass-manager"; } + std::string description() const override { return "DFG pass manager."; } + + private: + void AddPass(const std::string& name, Pass* pass) { + VLOG(3) << "Adding pass " << name; + Register(name, pass); + AddGraphvizDebugerPass(pass); + } + + void TryAddTensorRtPass() { + if (FLAGS_IA_enable_tensorrt_subgraph_engine) { auto trt_teller = [&](const Node* node) { std::unordered_set teller_set( {"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax", @@ -60,20 +88,6 @@ class DfgPassManagerImpl final : public DfgPassManager { new TensorRTSubgraphNodeMarkPass(trt_teller)); AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } - AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); - if (!FLAGS_inference_analysis_output_storage_path.empty()) { - AddPass("model-store-pass", new ModelStorePass); - } - } - - std::string repr() const override { return "dfg-pass-manager"; } - std::string description() const override { return "DFG pass manager."; } - - private: - void AddPass(const std::string& name, Pass* pass) { - LOG(INFO) << "Adding pass " << name; - Register(name, pass); - AddGraphvizDebugerPass(pass); } // Add the graphviz debuger pass if the parent pass has one. diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index c82fdfff86c91b4e07e3c1b80987d3d8d796ad23..a72875d36f45e3bf663a637f735d4d6b93b044d0 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -43,9 +43,10 @@ namespace paddle { // TODO(Superjomn) add a definition flag like PADDLE_WITH_TENSORRT and hide this // flag if not available. -DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); -DECLARE_string(inference_analysis_graphviz_log_root); -DECLARE_string(inference_analysis_output_storage_path); +DECLARE_bool(IA_enable_tensorrt_subgraph_engine); +DECLARE_string(IA_graphviz_log_root); +DECLARE_string(IA_output_storage_path); +DECLARE_bool(IA_enable_ir); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 24bfb3993cf569561980006b6627b56327dd0f67..3be336dd5c33a867a9d5e86ffa99c5b80fd153af 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -14,14 +14,16 @@ #include "paddle/fluid/inference/analysis/analyzer.h" #include +#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" namespace paddle { namespace inference { namespace analysis { TEST(Analyzer, analysis_without_tensorrt) { - FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; Argument argument; argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; @@ -29,13 +31,73 @@ TEST(Analyzer, analysis_without_tensorrt) { } TEST(Analyzer, analysis_with_tensorrt) { - FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = true; Argument argument; argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } +void TestWord2vecPrediction(const std::string& model_path) { + NativeConfig config; + config.model_dir = model_path; + config.use_gpu = false; + config.device = 0; + auto predictor = + ::paddle::CreatePaddlePredictor( + config); + + // One single batch + + int64_t data[4] = {1, 2, 3, 4}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + + // For simplicity, we set all the slots with the same data. + std::vector slots(4, tensor); + std::vector outputs; + CHECK(predictor->Run(slots, &outputs)); + + PADDLE_ENFORCE(outputs.size(), 1UL); + // Check the output buffer size and result of each tid. + PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); + float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, + 0.000932706}; + const size_t num_elements = outputs.front().data.length() / sizeof(float); + // The outputs' buffers are in CPU memory. + for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + LOG(INFO) << "data: " + << static_cast(outputs.front().data.data())[i]; + PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], + result[i]); + } +} + +// Turn on the IR pass supportion, run a real inference and check the result. +TEST(Analyzer, SupportIRPass) { + FLAGS_IA_enable_ir = true; + FLAGS_IA_enable_tensorrt_subgraph_engine = false; + FLAGS_IA_output_storage_path = "./analysis.out"; + + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset(new std::string("./analysis.out")); + + Analyzer analyzer; + analyzer.Run(&argument); + + // Should get the transformed model stored to ./analysis.out + ASSERT_TRUE(PathExists("./analysis.out")); + + // Inference from this path. + TestWord2vecPrediction("./analysis.out"); +} + } // namespace analysis } // namespace inference } // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/analysis/data_flow_graph.cc b/paddle/fluid/inference/analysis/data_flow_graph.cc index 7f64bc75ae8ad40a268739cdc36051e76af9f49a..100a7504b8526b3587858dd7783913757ba09895 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph.cc @@ -19,14 +19,16 @@ limitations under the License. */ namespace paddle { namespace inference { namespace analysis { +using ir_node_t = framework::ir::Node; +using ir_graph_t = framework::ir::Graph; // It is a better idea that the inputs and outputs of this graph is set manually // before, but there must be a Pass that helps to prune the unnecessary ops that // do not contribute to the given targets, so in this pass, analysis and get the // inputs and outputs is OK. void DataFlowGraph::Build() { - inputs.clear(); - outputs.clear(); + inputs_.clear(); + outputs_.clear(); std::unordered_set ins; std::unordered_set outs; for (auto &node : nodes.nodes()) { @@ -42,18 +44,140 @@ void DataFlowGraph::Build() { // similarly, the nodes that in outs but not in ins is the graphs' outputs for (auto *in : ins) { if (!outs.count(in)) { - inputs.push_back(in); + inputs_.push_back(in); } } for (auto *out : outs) { - if (!outs.count(out)) { - outputs.push_back(out); + if (!ins.count(out)) { + outputs_.push_back(out); } } Clean(); } +void DataFlowGraph::Build(const framework::proto::ProgramDesc &prog) { + // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. + std::unordered_map var2id; + auto &main_block = prog.blocks(framework::kRootBlockIndex); + for (int i = 0; i < main_block.vars_size(); i++) { + const auto &var = main_block.vars(i); + auto *v = nodes.Create(Node::Type::kValue); + v->SetName(var.name()); + v->SetPbDesc(const_cast(static_cast(&var))); + v->SetPbMsg(var.SerializeAsString()); + var2id[var.name()] = v->id(); + } + + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. + std::unordered_set unique_written_vars; + for (int i = 0; i < main_block.ops_size(); i++) { + const auto &op = main_block.ops(i); + auto *o = nodes.Create(Node::Type::kFunction); + o->SetName(op.type()); + static_cast(o)->SetFuncType(op.type()); + // Link to the original protobuf message's memory, make it easier to + // generate from a data flow graph to fluid ProgramDesc. + o->SetPbDesc(const_cast(static_cast(&op))); + o->SetPbMsg(op.SerializeAsString()); + + // set inputs and outputs + for (int j = 0; j < op.inputs_size(); j++) { + auto &in_var = op.inputs(j); + for (int k = 0; k < in_var.arguments_size(); k++) { + auto *in = nodes.GetMutable(var2id.at(in_var.arguments(k))); + in->outlinks.push_back(o); + o->inlinks.push_back(in); + unique_written_vars.insert(in); + } + } + for (int j = 0; j < op.outputs_size(); j++) { + auto &out_var = op.outputs(j); + for (int k = 0; k < out_var.arguments_size(); k++) { + auto *out = nodes.GetMutable(var2id[out_var.arguments(k)]); + if (unique_written_vars.count(out)) { + // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). + auto *out_alias = nodes.Create(Node::Type::kValue); + out_alias->SetName(out->name()); + out_alias->SetPbDesc(out->pb_desc()); + out_alias->SetPbMsg(out->pb_msg()); + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node + LOG(INFO) << "loop found in graph, create SSA alias node [" + << out_alias->repr() << "] for [" << out->repr() << "]"; + out = out_alias; + } + out->inlinks.push_back(o); + o->outlinks.push_back(out); + } + } + } + // Analysis and extract the inputs and outputs of this graph. + Build(); +} + +void DataFlowGraph::Build(const framework::ir::Graph &graph) { + // Create nodes + std::unordered_map ir_node_map; + for (auto *ir_node : graph.Nodes()) { + Node *x{nullptr}; + if (ir_node->IsOp()) { + PADDLE_ENFORCE(ir_node->Op()); + VLOG(4) << "get op " << ir_node << " " << ir_node->Name(); + x = nodes.Create(Node::Type::kFunction); + x->attr("ir_node").Pointer() = ir_node; + PADDLE_ENFORCE(ir_node->Op()->Proto()); + x->SetName(ir_node->Op()->Proto()->type()); + x->SetPbMsg(ir_node->Op()->Proto()->SerializeAsString()); + } else if (ir_node->IsVar()) { + // Not create a Node for IR ControlDepVar, considering Inference currently + // just used in single thread scenerio. + VLOG(4) << "get var " << ir_node->Name(); + x = nodes.Create(Node::Type::kValue); + x->attr("ir_node").Pointer() = ir_node; + x->SetName(ir_node->Name()); + // x->SetPbMsg(ir_node->Var()->Proto()->SerializeAsString()); + } else { + PADDLE_THROW("Failed to create an Node from IR, unknown type"); + } + ir_node_map.emplace(ir_node, x); + } + VLOG(4) << "finish creating Nodes"; + + VLOG(4) << "to create edge"; + // Create links + for (auto *ir_node : graph.Nodes()) { + auto it = ir_node_map.find(ir_node); + // Skip ControlDepVar. + if (it == ir_node_map.end()) continue; + auto *node = it->second; + for (auto *x : ir_node->inputs) { + if (!ir_node_map.count(x)) continue; + node->inlinks.push_back(ir_node_map.at(x)); + } + for (auto *x : ir_node->outputs) { + if (!ir_node_map.count(x)) continue; + node->outlinks.push_back(ir_node_map.at(x)); + } + } + + Build(); + PADDLE_ENFORCE(!inputs_.empty(), + "Can't deduce any inputs from the graph, Is the graph empty?"); + + ir_graph = &graph; + VLOG(3) << "finished build from IR"; +} + void DataFlowGraph::Clean() { for (auto &node : nodes.nodes()) { std::unordered_set inlinks_set(node->inlinks.begin(), @@ -61,11 +185,9 @@ void DataFlowGraph::Clean() { std::unordered_set outlinks_set(node->outlinks.begin(), node->outlinks.end()); if (inlinks_set.size() < node->inlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->inlinks.assign(inlinks_set.begin(), inlinks_set.end()); } if (outlinks_set.size() < node->outlinks.size()) { - LOG(INFO) << "Clean: node " << node->repr() << " prune duplicate inputs"; node->outlinks.assign(outlinks_set.begin(), outlinks_set.end()); } } @@ -112,10 +234,10 @@ GraphTraits::NodesBFSIterator::NodesBFSIterator( const std::vector &source) : queue_(source.begin(), source.end()) {} -// GraphTraits::NodesBFSIterator::NodesBFSIterator( -// GraphTraits::NodesBFSIterator &&other) noexcept -// : queue_(std::move(other.queue_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesBFSIterator::NodesBFSIterator( + GraphTraits::NodesBFSIterator &&other) noexcept + : queue_(std::move(other.queue_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesBFSIterator::NodesBFSIterator( const GraphTraits::NodesBFSIterator &other) @@ -159,7 +281,7 @@ bool GraphTraits::NodesBFSIterator::operator==( if (queue_.empty()) return other.queue_.empty(); if ((!queue_.empty()) && (!other.queue_.empty())) { return queue_.front() == other.queue_.front() && - visited_.size() == other.visited_.size(); // here need to check the + visited_.size() == other.visited_.size(); // equality of queue and // visited. Just a light but week implementation. } @@ -174,10 +296,10 @@ GraphTraits::NodesDFSIterator::NodesDFSIterator( for (auto *x : source) stack_.push(x); } -// GraphTraits::NodesDFSIterator::NodesDFSIterator( -// GraphTraits::NodesDFSIterator &&other) noexcept -// : stack_(std::move(other.stack_)), -// visited_(std::move(other.visited_)) {} +GraphTraits::NodesDFSIterator::NodesDFSIterator( + GraphTraits::NodesDFSIterator &&other) noexcept + : stack_(std::move(other.stack_)), + visited_(std::move(other.visited_)) {} GraphTraits::NodesDFSIterator::NodesDFSIterator( const GraphTraits::NodesDFSIterator &other) @@ -339,7 +461,7 @@ ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT void FilterRedundantOutputOfSubGraph(DataFlowGraph *graph) { std::vector op_nodes; - for (auto &node : GraphTraits(graph).nodes_in_TS()) { + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { if (node.type() == Node::Type::kValue || node.deleted()) { continue; } diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index bb3ec6bbc1d9555386aba8837b019d2511653258..437e097acd24aad384df6712ce0de6106b3b5c65 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -26,6 +26,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/inference/analysis/graph_traits.h" #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/platform/enforce.h" @@ -41,19 +42,43 @@ namespace analysis { */ struct DataFlowGraph { NodeMap nodes; - std::vector inputs; - std::vector outputs; + // inputs and outputs are deduced from the graph. + // Used to interact with IR. + const framework::ir::Graph *ir_graph{nullptr}; // Extract inputs and outputs of the graph. void Build(); + void Build(const framework::proto::ProgramDesc &prog); + + // Build a graph from ir::Graph. + void Build(const framework::ir::Graph &graph); + + // Get an attribute. + AnyAttr &Attr(const std::string &key) { return attrs_[key]; } + // Output a DOT graph file for debug. std::string DotString() const; std::string HumanReadableInfo(bool show_values = true, bool show_functions = true) const; + const std::vector &inputs() const { + PADDLE_ENFORCE(!inputs_.empty(), + "No inputs are deduced, need to Build() first."); + return inputs_; + } + const std::vector &outputs() const { + PADDLE_ENFORCE(!outputs_.empty(), + "No outputs are deduced, need to Build() first."); + return outputs_; + } + private: + mutable std::vector inputs_; + mutable std::vector outputs_; + std::unordered_map attrs_; + // Remove duplicate edges and so on. void Clean(); }; @@ -70,7 +95,7 @@ struct GraphTraits { : public std::iterator { NodesBFSIterator() = default; explicit NodesBFSIterator(const std::vector &source); - // NodesBFSIterator(NodesBFSIterator &&other) noexcept; + NodesBFSIterator(NodesBFSIterator &&other) noexcept; // NOTE Heavy to use. NodesBFSIterator(const NodesBFSIterator &other); @@ -93,8 +118,8 @@ struct GraphTraits { struct NodesDFSIterator : public std::iterator { NodesDFSIterator() = default; - explicit NodesDFSIterator(const std::vector &source); - // NodesDFSIterator(NodesDFSIterator &&other) noexcept; + NodesDFSIterator(const std::vector &source); + NodesDFSIterator(NodesDFSIterator &&other) noexcept; NodesDFSIterator(const NodesDFSIterator &other); Node &operator*(); @@ -116,7 +141,7 @@ struct GraphTraits { struct NodesTSIterator : public std::iterator { NodesTSIterator() = default; - explicit NodesTSIterator(const std::vector &source); + NodesTSIterator(const std::vector &source); NodesTSIterator(NodesTSIterator &&other) : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) { other.cursor_ = 0; @@ -138,7 +163,7 @@ struct GraphTraits { size_t cursor_{0}; }; - explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {} + explicit GraphTraits(const DataFlowGraph &graph) : graph_(graph) {} // default use BFS to visit the nodes. iterator_range nodes() { @@ -156,20 +181,20 @@ struct GraphTraits { private: NodesBFSIterator nodes_bfs_begin() { - return NodesBFSIterator(graph_->inputs); + return NodesBFSIterator(graph_.inputs()); } NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); } NodesDFSIterator nodes_dfs_begin() { - return NodesDFSIterator(graph_->inputs); + return NodesDFSIterator(graph_.inputs()); } NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); } - NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); } + NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_.inputs()); } NodesTSIterator nodes_ts_end() { return NodesTSIterator(); } private: - DataFlowGraph *graph_; + const DataFlowGraph &graph_; }; // Extract the inputs and outputs of a graph. The inputs and outputs of a diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index a881262665f156812da9e1576aa29b05fc398499..1682011c3d8cc9927a4b026b370671798cace625 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/inference/analysis/data_flow_graph.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/ut_helper.h" namespace paddle { @@ -24,20 +25,18 @@ TEST(DataFlowGraph, BFS) { auto dfg = ProgramDescToDFG(desc); dfg.Build(); - for (auto *in : dfg.inputs) { + for (auto* in : dfg.inputs()) { LOG(INFO) << "inputs: " << in->name() << " " << static_cast(in->type()); } - for (auto *out : dfg.outputs) { + for (auto* out : dfg.outputs()) { LOG(INFO) << "outputs: " << out->name() << " " << static_cast(out->type()); } - GraphTraits trait(&dfg); - auto nodes = trait.nodes(); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); @@ -45,13 +44,11 @@ TEST(DataFlowGraph, BFS) { TEST(DataFlowGraph, DFS) { auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); - auto dfg = ProgramDescToDFG(desc); - dfg.Build(); - GraphTraits trait(&dfg); - auto nodes = trait.nodes_in_DFS(); + DataFlowGraph dfg; + dfg.Build(desc); size_t count = 0; - for (auto it = nodes.begin(); it != nodes.end(); ++it) { - LOG(INFO) << "visiting " << it->name(); + for (auto& node : GraphTraits(dfg).nodes_in_DFS()) { + LOG(INFO) << "visiting " << node.name(); ++count; } ASSERT_EQ(count, dfg.nodes.size()); @@ -74,21 +71,17 @@ TEST(DataFlowGraph, TS) { DataFlowGraph graph; for (int i = 0; i < 8; i++) { - auto *node = graph.nodes.Create(Node::Type::kValue); + auto* node = graph.nodes.Create(Node::Type::kValue); node->SetName("node-" + std::to_string(i)); } auto add_link = [&](int i, int j) { - Node *source = graph.nodes.GetMutable(i); - Node *target = graph.nodes.GetMutable(j); + Node* source = graph.nodes.GetMutable(i); + Node* target = graph.nodes.GetMutable(j); target->inlinks.push_back(source); source->outlinks.push_back(target); }; - graph.inputs.push_back(graph.nodes.GetMutable(0)); - graph.inputs.push_back(graph.nodes.GetMutable(1)); - graph.inputs.push_back(graph.nodes.GetMutable(2)); - add_link(0, 4); add_link(0, 5); add_link(1, 6); @@ -97,8 +90,9 @@ TEST(DataFlowGraph, TS) { add_link(4, 7); add_link(4, 3); add_link(7, 3); + graph.Build(); - auto its = GraphTraits(&graph).nodes_in_TS(); + auto its = GraphTraits(graph).nodes_in_TS(); std::vector sorted_ids; for (auto it = its.begin(); it != its.end(); ++it) { LOG(INFO) << it->name(); @@ -122,6 +116,50 @@ TEST(DataFlowGraph, TS) { assert_positive_sequence_pair(4, 7); } +TEST(DataFlowGraph, Build_ProgramDesc) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); + DataFlowGraph graph; + graph.Build(desc); + ASSERT_EQ(graph.nodes.size(), 38UL); +} + +void SetOp(framework::ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetInput("Xs", inputs); + op->SetOutput("Xs", outputs); +} + +TEST(DataFlowGraph, Build_IR_Graph) { + framework::ProgramDesc prog; + for (auto& v : std::vector({"a", "b", "c", "d", "e", "f"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(framework::proto::VarType::SELECTED_ROWS); + if (v == "c") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"a"}), + std::vector({"c"})); + SetOp(&prog, "mul", std::vector({"b", "c"}), + std::vector({"d"})); + SetOp(&prog, "elementwise_add", std::vector({"d", "e"}), + std::vector({"f"})); + + DataFlowGraph graph; + + framework::ir::Graph ir_graph(prog); + + graph.Build(ir_graph); + + ASSERT_EQ(graph.nodes.size(), ir_graph.Nodes().size()); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index ce0639a6162da6347ed130ecb1586c9a2d4071d5..8c7dd146e429a7f5cd28bdd418e457e8ea5680bd 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -49,18 +49,15 @@ bool DataFlowGraphToFluidPass::Initialize(Argument *argument) { bool DataFlowGraphToFluidPass::Finalize() { return true; } void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { - LOG(INFO) << "graph.inputs " << graph->inputs.size(); - for (auto &node : GraphTraits(graph).nodes_in_TS()) { + // FilterRedundantOutputOfSubGraph(graph); + for (auto &node : GraphTraits(*graph).nodes_in_TS()) { if (node.deleted()) continue; switch (node.type()) { case Node::Type::kFunction: { - LOG(INFO) << "add function " << node.repr(); AddFluidOp(&node); } break; case Node::Type::kFunctionBlock: { - LOG(INFO) << "add engine op " << node.repr() << " , " - << static_cast(&node)->subgraph.size(); AddEngineOp(&node); } break; default: @@ -72,15 +69,27 @@ void DataFlowGraphToFluidPass::Run(DataFlowGraph *graph) { } void DataFlowGraphToFluidPass::AddFluidOp(Node *node) { - auto *ori_op = static_cast(node->pb_desc()); + PADDLE_ENFORCE(node); + PADDLE_ENFORCE(node->IsFunction()); + PADDLE_ENFORCE(node->pb_desc() || !node->pb_msg().empty(), + "node has invalid protobuf repr."); + // currently only the main block is analyzed. + PADDLE_ENFORCE(desc_); auto *main_block = desc_->mutable_blocks(framework::kRootBlockIndex); auto *op = main_block->add_ops(); - *op = *ori_op; // copy the attributes, by default, these will not be changed - // by analysis phrase. - // The inputs and outputs of the existing ops are not changed by tensorrt - // subgraph pass. - // NOTE It might be changed by other passes in the long run. + + if (node->pb_desc()) { + auto *ori_op = static_cast(node->pb_desc()); + *op = + *ori_op; // copy the attributes, by default, these will not be changed + // by analysis phrase. + // The inputs and outputs of the existing ops are not changed by tensorrt + // subgraph pass. + // NOTE It might be changed by other passes in the long run. + } else { + op->ParseFromString(node->pb_msg()); + } } void CreateTrtEngineOp(Node *node, const DataFlowGraph &graph, @@ -215,10 +224,9 @@ void DataFlowGraphToFluidPass::AddEngineOp(Node *node) { framework::BlockDesc block_desc(nullptr, &proto); block_desc.Proto()->set_parent_idx(-1); block_desc.Proto()->set_idx(0); - LOG(INFO) << "origin variable size: " - << argument_->origin_program_desc->blocks(0).vars().size(); - LOG(INFO) << "transformed variable size: " - << block_desc.Proto()->vars().size(); + VLOG(4) << "origin variable size: " + << argument_->origin_program_desc->blocks(0).vars().size(); + VLOG(4) << "transformed variable size: " << block_desc.Proto()->vars().size(); // copy ops. for (auto *node : block_node->subgraph) { @@ -252,7 +260,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { Pass *DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, + FLAGS_IA_graphviz_log_root, "data_flow_graph_to_fluid_graphviz_debugger")); } diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc index c05b0e5d4690d0a447edf63a149903704bc2c9be..648b8f7d6a6ec4bafbad2838c5631e776c8699b1 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.cc @@ -29,7 +29,7 @@ void DFG_GraphvizDrawPass::Run(DataFlowGraph *graph) { auto png_path = dot_path.substr(0, dot_path.size() - 4) + ".png"; std::string message; - LOG(INFO) << "draw to " << png_path; + VLOG(3) << "draw to " << png_path; ExecShellCommand("dot -Tpng " + dot_path + " -o " + png_path, &message); } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 16d82b5aa1acaf87d1cd78ad5b79faa65143ad7d..51bd0ac42d455f68ac5d70f0ce9703dfad6070d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -52,72 +52,7 @@ bool FluidToDataFlowGraphPass::Finalize() { return true; } void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); - // insert vars - // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id - // will keep updating to its latest alias during the graph-building. - std::unordered_map var2id; - auto &main_block = desc_->blocks(framework::kRootBlockIndex); - for (int i = 0; i < main_block.vars_size(); i++) { - const auto &var = main_block.vars(i); - auto *v = graph->nodes.Create(Node::Type::kValue); - v->SetName(var.name()); - v->SetPbDesc(const_cast(static_cast(&var))); - v->SetPbMsg(var.SerializeAsString()); - var2id[var.name()] = v->id(); - } - - // The variables in a SSA can only write once, so if a variable is written - // multiple times(quite common in our ProgramDesc design), multiple alias - // Nodes of this variable will be created, and each will just write once. - - // An set that keep all the names of the variables(the original, not alias) - // that have been written(as outputs). Once an Op's output variable hit the - // set, it should create a new alias and update the global alias for this - // variable. And that make a Data Flow Graph a SSA. - std::unordered_set unique_written_vars; - for (int i = 0; i < main_block.ops_size(); i++) { - const auto &op = main_block.ops(i); - auto *o = graph->nodes.Create(Node::Type::kFunction); - o->SetName(op.type()); - static_cast(o)->SetFuncType(op.type()); - // Link to the original protobuf message's memory, make it easier to - // generate from a data flow graph to fluid ProgramDesc. - o->SetPbDesc(const_cast(static_cast(&op))); - o->SetPbMsg(op.SerializeAsString()); - - // set inputs and outputs - for (int j = 0; j < op.inputs_size(); j++) { - auto &in_var = op.inputs(j); - for (int k = 0; k < in_var.arguments_size(); k++) { - auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); - in->outlinks.push_back(o); - o->inlinks.push_back(in); - unique_written_vars.insert(in); - } - } - for (int j = 0; j < op.outputs_size(); j++) { - auto &out_var = op.outputs(j); - for (int k = 0; k < out_var.arguments_size(); k++) { - auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (unique_written_vars.count(out)) { - // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). - auto *out_alias = graph->nodes.Create(Node::Type::kValue); - out_alias->SetName(out->name()); - out_alias->SetPbDesc(out->pb_desc()); - out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = - out_alias->id(); // update variable's alias Node - LOG(INFO) << "loop found in graph, create SSA alias node [" - << out_alias->repr() << "] for [" << out->repr() << "]"; - out = out_alias; - } - out->inlinks.push_back(o); - o->outlinks.push_back(out); - } - } - } - // Analysis and extract the inputs and outputs of this graph. - graph->Build(); + graph->Build(*desc_); } namespace { @@ -133,7 +68,7 @@ class DFG_DebuggerPass : public DFG_GraphvizDrawPass { Pass *FluidToDataFlowGraphPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( - FLAGS_inference_analysis_graphviz_log_root, "fluid-to-dfg-debuger")); + FLAGS_IA_graphviz_log_root, "fluid-to-dfg-debuger")); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index d218dcd05015aa4636c16569de4addf4936c8cd5..267a0a84ebf75615e0b390f4a1b3bf3b51793fc7 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -30,7 +30,7 @@ TEST(FluidToDataFlowGraphPass, Test) { ASSERT_EQ(argument.main_dfg->nodes.size(), 38UL); pass.Finalize(); ASSERT_FALSE(argument.main_dfg->DotString().empty()); - EXPECT_FALSE(argument.main_dfg->inputs.empty()); + EXPECT_FALSE(argument.main_dfg->inputs().empty()); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..073f49752872cbb65fddc74be75ec28d4dd0bbaf --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass.h b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..fa3f8d313bbdd6733fa3878dd7023e125b6ced36 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass.h @@ -0,0 +1,82 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class FluidToIrPass final : public DataFlowGraphPass { + public: + FluidToIrPass() = default; + + bool Initialize(Argument *argument) override { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + // set fluid model program path + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + // Load program. + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + // Create main data flow graph. + if (!argument->main_dfg) { + argument->main_dfg.reset(new DataFlowGraph); + } + // Persist the ProgramDesc in graph's attribute. The IR graph just keep the + // address, will segfault if the original ProgramDesc destroys. + auto &ir_program_p = argument->main_dfg->Attr("ir_program_desc").Pointer(); + ir_program_p = new framework::ProgramDesc(program); + + argument_ = argument; + return true; + } + + bool Finalize() override { return true; } + + void Run(DataFlowGraph *graph) override { + // Call all the IR Passes + IRPassManager ir_passes(*static_cast( + argument_->main_dfg->Attr("ir_program_desc").Pointer())); + ir_passes.Apply(std::vector( + {// Manual update the passes here. + "graph_viz_pass", "infer_clean_graph_pass", "graph_viz_pass", + "fc_fuse_pass", "graph_viz_pass"})); + + PADDLE_ENFORCE(argument_->main_dfg.get()); + argument_->main_dfg->Build(ir_passes.graph()); + // PADDLE_ENFORCE(argument_->main_dfg->IsFullyConnected()); + } + + std::string repr() const override { return "fluid-to-ir-pass"; } + + private: + Argument *argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..af934f261baa3807059ce6ab036545594630df58 --- /dev/null +++ b/paddle/fluid/inference/analysis/fluid_to_ir_pass_tester.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/fluid_to_ir_pass.h" + +#include +#include "paddle/fluid/inference/analysis/ut_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(FluidToIrPass, Test) { + FluidToIrPass pass; + Argument argument(FLAGS_inference_model_dir); + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +USE_PASS(fc_fuse_pass); +USE_PASS(graph_viz_pass); +USE_PASS(infer_clean_graph_pass); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index a0f912b251d5ea29594a7f601d5b2bce91201790..5151e2b69ac199dea136535ba445e890596f6227 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include #include @@ -151,6 +152,23 @@ static framework::proto::ProgramDesc LoadProgramDesc( return program_desc; } +static bool FileExists(const std::string &filepath) { + std::ifstream file(filepath); + bool exists = file.is_open(); + file.close(); + return exists; +} + +static bool PathExists(const std::string &path) { + struct stat statbuf; + if (stat(path.c_str(), &statbuf) != -1) { + if (S_ISDIR(statbuf.st_mode)) { + return true; + } + } + return false; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..d849b637bcf3fe3944ad11680bbe041e19a71e24 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/ir_pass_manager.h" +#include + +namespace paddle { +namespace inference { +namespace analysis { + +IRPassManager::IRPassManager(const ProgramDesc& program) { + graph_.reset(new framework::ir::Graph(program)); +} + +void IRPassManager::Apply(const std::vector& passes) { + graph_->Set("graph_viz_path", new std::string("./1.dot")); + // Apply all the passes + std::string pre_pass; + for (const std::string& pass_name : passes) { + LOG(WARNING) << "Running IR pass [" << pass_name << "]"; + auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); + if (pass_name == "graph_viz_pass") { + std::string dot_file_path = + "ir_" + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; + pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + } + graph_ = pass->Apply(std::move(graph_)); + pre_pass = pass_name; + } +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h new file mode 100644 index 0000000000000000000000000000000000000000..3338e37ecf1c591a631fd829a05b07e562af703e --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines IRPassManager, it helps control the passes in IR. Inference + * phrase will load the model program and parameters from disk, that is quite + * different from the training phase. + * This manager will control the Passes and make the passes in IR work smoothly + * for inference. + */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace inference { +namespace analysis { +using framework::ProgramDesc; + +class IRPassManager final { + public: + IRPassManager(const ProgramDesc& program); + + void Apply(const std::vector& passes); + + framework::ir::Graph& graph() const { return *graph_; } + + private: + std::unique_ptr graph_; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc index 1c429176424bd5c1d8fa5e015c19d698f966880e..c313db08875669010ddcca13aa66b383ee6d26f8 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.cc +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -35,19 +35,21 @@ void ModelStorePass::Run(DataFlowGraph *x) { std::stringstream ss; // NOTE these commands only works on linux. ss << "mkdir -p " << *argument_->model_output_store_path; - LOG(INFO) << "run command: " << ss.str(); + VLOG(3) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); ss.str(""); ss << "cp " << *argument_->fluid_model_dir << "/*" << " " << *argument_->model_output_store_path; - LOG(INFO) << "run command: " << ss.str(); + VLOG(3) << "run command: " << ss.str(); PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); // Store program PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, "program desc is not transformed, should call " "DataFlowGraphToFluidPass first."); + VLOG(3) << "store analyzed program to " + << *argument_->model_output_store_path; const std::string program_output_path = *argument_->model_output_store_path + "/__model__"; std::ofstream file(program_output_path, std::ios::binary); @@ -58,6 +60,8 @@ void ModelStorePass::Run(DataFlowGraph *x) { file.write(serialized_message.c_str(), serialized_message.size()); } +bool ModelStorePass::Finalize() { return true; } + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h index fac7083925776b6209d49255c9e67b930cb1250b..3a2869e30bd80cfd0756f8e21acb414656620eaa 100644 --- a/paddle/fluid/inference/analysis/model_store_pass.h +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -44,6 +44,8 @@ class ModelStorePass : public DataFlowGraphPass { model in the disk, and that model can be reloaded for prediction again.)DD"; } + bool Finalize() override; + private: Argument* argument_{nullptr}; }; diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc index 5f3526dd504e77e58d79b4f675db86a22fd0f26b..d6493fc25edf25003504542f1b01c4105754c8df 100644 --- a/paddle/fluid/inference/analysis/model_store_pass_tester.cc +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -30,7 +30,7 @@ TEST(DFG_StorePass, test) { argument.model_output_store_path.reset( new std::string("./_dfg_store_pass_tmp")); // disable storage in alalyzer - FLAGS_inference_analysis_output_storage_path = ""; + FLAGS_IA_output_storage_path = ""; analyzer.Run(&argument); ModelStorePass pass; diff --git a/paddle/fluid/inference/analysis/node.h b/paddle/fluid/inference/analysis/node.h index fb426fb893d12c017deda74fc05016053fbc6b1c..af34156bc2f101465d87cb10e2155745022eb521 100644 --- a/paddle/fluid/inference/analysis/node.h +++ b/paddle/fluid/inference/analysis/node.h @@ -38,7 +38,7 @@ namespace analysis { class NodeMap; // A helper class to maintain the status from Pass. -struct NodeAttr { +struct AnyAttr { using any_t = boost::variant; // NOTE T should be a primary type or a struct combined by several primary @@ -54,10 +54,9 @@ struct NodeAttr { void *&Pointer() { return As(); } std::string &String() { return As(); } - private: template T &As() { - if (type_index_ == typeid(NodeAttr)) { + if (type_index_ == typeid(AnyAttr)) { type_index_ = typeid(T); any_data_ = T(); } else { @@ -68,7 +67,7 @@ struct NodeAttr { private: any_t any_data_; - std::type_index type_index_{typeid(NodeAttr)}; + std::type_index type_index_{typeid(AnyAttr)}; }; /* @@ -105,7 +104,7 @@ class Node { // Get an additional attribute and convert it to T data type. NOTE this will // silently create a new attribute if not exists. - NodeAttr &attr(const std::string &name) const { return attrs_[name]; } + AnyAttr &attr(const std::string &name) const { return attrs_[name]; } int id() const { return id_; } @@ -150,7 +149,7 @@ class Node { Type type_{Type::kNone}; // Mark this node is deleted by some pass. bool deleted_{false}; - mutable std::unordered_map attrs_; + mutable std::unordered_map attrs_; }; class Function; diff --git a/paddle/fluid/inference/analysis/node_tester.cc b/paddle/fluid/inference/analysis/node_tester.cc index 8bbcfff53741772ee3705e2efdf46a1b59ee02ab..9207c15373fb4264ff0e738e93ae88e1c08b554c 100644 --- a/paddle/fluid/inference/analysis/node_tester.cc +++ b/paddle/fluid/inference/analysis/node_tester.cc @@ -21,19 +21,19 @@ namespace inference { namespace analysis { TEST(NodeAttr, bool) { - NodeAttr x; + AnyAttr x; x.Bool() = true; ASSERT_EQ(x.Bool(), true); } TEST(NodeAttr, int32) { - NodeAttr x; + AnyAttr x; x.Int32() = 32; ASSERT_EQ(x.Int32(), 32); } TEST(NodeAttr, string) { - NodeAttr x; + AnyAttr x; x.String() = "Hello"; ASSERT_EQ(x.String(), "Hello"); } diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6806f9ff7dada2c1e2328e1ffbfd225afefcf474..7719c6f5ff3c940948c7bdbcb25513cdf430281b 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -63,7 +63,7 @@ class Pass { // Human-readable short representation. virtual std::string repr() const = 0; // Human-readable long description. - virtual std::string description() const = 0; + virtual std::string description() const { return "No DOC"; } }; // NodePass process on any Node types. diff --git a/paddle/fluid/inference/analysis/pass_manager.cc b/paddle/fluid/inference/analysis/pass_manager.cc index b428bb22b1f0c5c1a47fc4c46c9070c1ace4a228..cfdca33882ea00a28e3ea51ca5fd77ec9605bf3a 100644 --- a/paddle/fluid/inference/analysis/pass_manager.cc +++ b/paddle/fluid/inference/analysis/pass_manager.cc @@ -22,7 +22,7 @@ namespace analysis { bool PassManager::Initialize(Argument* argument) { argument_ = argument; for (auto& pass : data_) { - LOG(INFO) << "Initializing pass " << pass->repr(); + LOG(WARNING) << "Initializing pass [" << pass->repr() << "]"; if (!pass->Initialize(argument)) { LOG(ERROR) << "Failed to initialize pass [" << pass->repr() << "]"; return false; @@ -33,8 +33,9 @@ bool PassManager::Initialize(Argument* argument) { void DfgPassManager::RunAll() { PADDLE_ENFORCE(argument_); + LOG(INFO) << "Total " << data_.size() << " passes"; for (auto& pass : data_) { - VLOG(4) << "Running pass [" << pass->repr() << "]"; + LOG(WARNING) << "Running pass [" << pass->repr() << "]"; pass->Run(argument_->main_dfg.get()); } } @@ -42,8 +43,7 @@ void DfgPassManager::RunAll() { void NodePassManager::RunAll() { PADDLE_ENFORCE(argument_); PADDLE_ENFORCE(argument_->main_dfg.get()); - auto trait = - GraphTraits(argument_->main_dfg.get()).nodes_in_DFS(); + auto trait = GraphTraits(*argument_->main_dfg).nodes_in_DFS(); for (auto& node : trait) { for (auto& pass : data_) { pass->Run(&node); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter.cc b/paddle/fluid/inference/analysis/subgraph_splitter.cc index 9146c0e45e77b5f120d3be622f74e3008bca2b6f..670a8de667494c655bed15aa3e4ce8265448635a 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc @@ -34,7 +34,7 @@ inline void MarkOutLinksInSubGraph(const Function *func) { } void SubGraphSplitter::MarkNodesInsideSubGraph() { - for (auto &node : GraphTraits(graph_).nodes()) { + for (auto &node : GraphTraits(*graph_).nodes()) { if (node_inside_subgraph_teller_(&node)) { node.attr(kMarkerAttrName).Bool() = true; if (node.type() == Node::Type::kFunction) { @@ -76,7 +76,7 @@ void UnionFindCombine(const node_map_t &node_map, size_t a, size_t b) { std::vector> SubGraphSplitter::ExtractSubGraphs() { std::vector marked_nodes; - for (auto &node : GraphTraits(graph_).nodes_in_TS()) { + for (auto &node : GraphTraits(*graph_).nodes_in_TS()) { if (node.attr(kMarkerAttrName).Bool()) { marked_nodes.push_back(&node); } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index f736e385c11add152dc9ab9485bf1de40f80b2f3..9f51fafe0b2a66f9d062a6b751fe7a3bc662ce7c 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -69,8 +69,8 @@ class DfgDebuggerPass : public DFG_GraphvizDrawPass { }; Pass *TensorRTSubgraphNodeMarkPass::CreateGraphvizDebugerPass() const { - DFG_GraphvizDrawPass::Config config( - FLAGS_inference_analysis_graphviz_log_root, "tensorrt_marked_node"); + DFG_GraphvizDrawPass::Config config(FLAGS_IA_graphviz_log_root, + "tensorrt_marked_node"); return new DfgDebuggerPass(config); } bool TensorRTSubgraphNodeMarkPass::Finalize() { return true; } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0..8f1a72316d6c146ebc9a86ced739ef088a3b4267 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -23,7 +23,7 @@ namespace paddle { DEFINE_string(dirname, "", "Directory of the inference model."); void CompareTensorRTWithFluid(bool enable_tensorrt) { - FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = enable_tensorrt; + FLAGS_IA_enable_tensorrt_subgraph_engine = enable_tensorrt; //# 1. Create PaddlePredictor with a config. NativeConfig config0; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ed8e9ed77fb233e40bb78329a246ff724b21c547..68fbde2c09fd9a9e84fd7f1202fe474beb0e81b9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -9,7 +9,6 @@ function(op_library TARGET) # op_library is a function to create op library. The interface is same as # cc_library. But it handle split GPU/CPU code and link some common library # for ops. - set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) set(hip_cu_srcs) @@ -92,6 +91,7 @@ function(op_library TARGET) endif() endforeach() endif(WIN32) + set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) list(LENGTH op_library_DEPS op_library_DEPS_len) if (${op_library_DEPS_len} GREATER 0) diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index de1a503154deb967eb4389a9f43b86c05626d966..66784f0b5149a7c479a90a407709d993f4a40a8b 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -130,12 +130,13 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); - auto* lt_var = scope->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); + // TODO(tangwei12): find out why scope will be error. + auto* lt_var = scope_->FindVar(LOOKUP_TABLE_PATH)->GetMutable(); lt_var->clear(); lt_var->append(out_var_name); VLOG(4) << "RequestCheckpointHandler update var kLookupTablePath to: " << out_var_name; - executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope); + executor_->RunPreparedContext(checkpoint_prepared_ctx_.get(), scope_); return true; } diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index dc73cb6f23614504640283af01981d3f69e89126..329d2d129a9ea450cd211f0c6d2ea5e37ff8491d 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/elementwise_op_function.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { @@ -23,6 +24,37 @@ struct MulFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a * b; } }; +template +void default_elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, + const framework::Tensor* y, framework::Tensor* z) { + int axis = ctx.Attr("axis"); + ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, + MulFunctor(), z); +} + +template +typename std::enable_if< + std::is_floating_point::value && + std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + auto blas = math::GetBlas(ctx); + blas.VMUL(x->numel(), x->data(), y->data(), + z->mutable_data(ctx.GetPlace())); +} + +template +typename std::enable_if< + !std::is_floating_point::value || + !std::is_same::value>::type +elementwise_mul(const framework::ExecutionContext& ctx, + const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z) { + default_elementwise_mul(ctx, x, y, z); +} + template class ElementwiseMulKernel : public framework::OpKernel { public: @@ -33,9 +65,11 @@ class ElementwiseMulKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - int axis = ctx.Attr("axis"); - ElementwiseComputeEx, DeviceContext, T>(ctx, x, y, axis, - MulFunctor(), z); + if (x->numel() == y->numel()) { + elementwise_mul(ctx, x, y, z); + } else { + default_elementwise_mul(ctx, x, y, z); + } } }; diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 72287ae6ac60f8de5eb62733791b0c9353dbe86b..fa4dec9cf118cef9b836943fd4eae90d23e6218a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -35,9 +35,14 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { if (ctx->HasInput("Bias")) { auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); - PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], - "The shape of Bias must be [1, dim]."); + if (bias_dims.size() == 2) { + PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim]."); + PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1], + "The shape of Bias must be [1, dim]."); + } else if (bias_dims.size() == 1) { + PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1], + "The shape of Bias must be [1, dim]."); + } } PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, "Fully Connected input should be 2-D or 4-D tensor."); diff --git a/paddle/fluid/operators/load_op.cc b/paddle/fluid/operators/load_op.cc index 27e26cb1b5c1e831f05dac299489628b92eaa58c..51219504ffa2a778b56351f759e8a8dfb951ad91 100644 --- a/paddle/fluid/operators/load_op.cc +++ b/paddle/fluid/operators/load_op.cc @@ -92,6 +92,7 @@ class LoadOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); framework::DeserializeFromStream(fin, selectedRows, dev_ctx); + selectedRows->SyncIndex(); } }; diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 2558154e0b39a4281bfaa59ba75867589d73be5d..8dcf7c99f3860789dee834787eeb8b7ad4cc3530 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -134,6 +134,9 @@ class Blas { template void VADD(int n, const T* x, const T* y, T* z) const; + template + void VMUL(int n, const T* x, const T* y, T* z) const; + template void VCOPY(int n, const T* x, T* y) const; @@ -202,6 +205,11 @@ class BlasT : private Blas { Base()->template VADD(args...); } + template + void VMUL(ARGS... args) const { + Base()->template VMUL(args...); + } + template void VCOPY(ARGS... args) const { Base()->template VCOPY(args...); diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index bf3382107960dfd8b52f94b421b49022dcb6d291..dc77b6d793702458a22a2f59b68e9d9f2c23b4ff 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -82,6 +82,11 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vsAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vsMul(args...); + } }; template <> @@ -142,6 +147,11 @@ struct CBlas { static void VADD(ARGS... args) { platform::dynload::vdAdd(args...); } + + template + static void VMUL(ARGS... args) { + platform::dynload::vdMul(args...); + } }; #else @@ -199,6 +209,7 @@ struct CBlas { static void SMM_GEMM(...) { PADDLE_THROW("float16 SMM_GEMM not supported on CPU"); } + static void VMUL(...) { PADDLE_THROW("float16 VMUL not supported on CPU"); } #ifdef PADDLE_WITH_MKLML static void GEMM_BATCH(...) { PADDLE_THROW("float16 GEMM_BATCH not supported on CPU"); @@ -374,6 +385,20 @@ void Blas::VADD(int n, const T *x, const T *y, #endif } +template <> +template +void Blas::VMUL(int n, const T *x, const T *y, + T *z) const { +#ifdef PADDLE_WITH_MKLML + CBlas::VMUL(n, x, y, z); +#else + // try to find if openblas support vmul + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +#endif +} + template <> template void Blas::GEMV(bool trans_a, int M, int N, T alpha, diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 201a51130d6b6f94104e2dabf9e7facffa672ae0..85de37416b5f24128ee98320a872eafffe967c81 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -142,6 +142,8 @@ class SaveOp : public framework::OperatorBase { std::string filename = lt_var->data(); VLOG(4) << "SaveSelectedRows get File name: " << filename; + MkDirRecursively(DirName(filename).c_str()); + auto &selectedRows = var->Get(); // get device context from pool diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index bf5e0d864495ce3a651a31c9d5a7664fe9eb2396..c32d2603cf76f55a9e723196977b0a70c92d597a 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -81,8 +81,8 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The source input of scatter op"); AddInput("Ids", "The index input of scatter op where X will be updated"); - AddInput("Updates", "The updated value of updates op"); - AddOutput("Out", "The output of add op"); + AddInput("Updates", "The updated value of scatter op"); + AddOutput("Out", "The output of scatter op"); AddComment(R"DOC( Scatter Operator. @@ -90,7 +90,7 @@ This operator obtains output by updating the input on selected indices on the fi $$ Out = X \\ -Out[Ids] = X[Ids] + Updates +Out[Ids] = Updates $$ )DOC"); diff --git a/paddle/fluid/operators/scatter_op.h b/paddle/fluid/operators/scatter_op.h index 181bb1af5cce7fad228e61e1a76ed66a9bd61b3e..2eefbba9726af4d38b40d91e9242faa2923dca20 100644 --- a/paddle/fluid/operators/scatter_op.h +++ b/paddle/fluid/operators/scatter_op.h @@ -34,9 +34,9 @@ class ScatterOpKernel : public framework::OpKernel { auto *Updates = ctx.Input("Updates"); auto *Out = ctx.Output("Out"); - // In place output: Out = X, Out[Ids] += Updates + // In place output: Out = X, Out[Ids] = Updates framework::TensorCopySync(*X, ctx.GetPlace(), Out); - // Apply ScatterUpdate: Out[index] += Updates[:] + // Apply ScatterUpdate: Out[index] = Updates[:] ScatterAssign(ctx.device_context(), *Updates, *Ids, Out); } }; @@ -55,7 +55,7 @@ class ScatterGradientOpKernel : public framework::OpKernel { // In place gradient: dX = dO framework::TensorCopySync(*dOut, ctx.GetPlace(), dX); dUpdates->mutable_data(ctx.GetPlace()); - // Gradient by Gather: dUpdates += dO[Ids] + // Gradient by Gather: dUpdates = dO[Ids] CPUGather(ctx.device_context(), *dOut, *Ids, dUpdates); } }; diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 25bcda7eedc1ef42f75fb8fd1439f0c8f55015c3..c7c533bd42859c374c4783d43ec4cdd34a6a994a 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -17,10 +17,10 @@ #include #include #include -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 77e46fa768b62c277d7b4027de7173e39a5672b4..0103e7a3accf88f3c83f109298010c3c9af3d549 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -15,9 +15,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index e8f4a82ef132be9e4ec3fb76f11766046a2ff638..b946f46e82af4b09fafff54765b899254a4ec1df 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -17,10 +17,10 @@ limitations under the License. */ #include #include -#include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/curand.h b/paddle/fluid/platform/dynload/curand.h index 5b9e0820e0b319fe7a636a57a0029caf038b4db3..2daf1b4215ce1f7f771bbac72bfe103b0b941976 100644 --- a/paddle/fluid/platform/dynload/curand.h +++ b/paddle/fluid/platform/dynload/curand.h @@ -14,9 +14,9 @@ limitations under the License. */ #pragma once #include -#include #include // NOLINT +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index 9e7a616094e184695de521aa035257bde4170a91..15ad4a3b40b1ad13a10dd37449c6f6f3e2029df6 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -14,10 +14,10 @@ limitations under the License. */ #pragma once -#include #include #include // NOLINT #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { @@ -49,25 +49,27 @@ extern void* mklml_dso_handle; #define MKLML_ROUTINE_EACH(__macro) \ __macro(cblas_sgemm); \ - __macro(cblas_saxpy); \ - __macro(cblas_scopy); \ - __macro(cblas_sgemv); \ - __macro(cblas_sgemm_batch); \ __macro(cblas_dgemm); \ + __macro(cblas_saxpy); \ __macro(cblas_daxpy); \ + __macro(cblas_scopy); \ __macro(cblas_dcopy); \ + __macro(cblas_sgemv); \ __macro(cblas_dgemv); \ - __macro(cblas_dgemm_batch); \ - __macro(vsAdd); \ - __macro(vdAdd); \ __macro(cblas_sgemm_alloc); \ - __macro(cblas_sgemm_pack); \ - __macro(cblas_sgemm_compute); \ - __macro(cblas_sgemm_free); \ __macro(cblas_dgemm_alloc); \ + __macro(cblas_sgemm_pack); \ __macro(cblas_dgemm_pack); \ + __macro(cblas_sgemm_compute); \ __macro(cblas_dgemm_compute); \ + __macro(cblas_sgemm_free); \ __macro(cblas_dgemm_free); \ + __macro(cblas_sgemm_batch); \ + __macro(cblas_dgemm_batch); \ + __macro(vsAdd); \ + __macro(vdAdd); \ + __macro(vsMul); \ + __macro(vdMul); \ __macro(MKL_Set_Num_Threads) MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); diff --git a/paddle/fluid/platform/dynload/nccl.h b/paddle/fluid/platform/dynload/nccl.h index 575516f81870fc9f7b92919ffc20a201cb5cbce8..331ca9908e126d5dbca830457281fbf88fc1df09 100644 --- a/paddle/fluid/platform/dynload/nccl.h +++ b/paddle/fluid/platform/dynload/nccl.h @@ -13,12 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/dynload/warpctc.h b/paddle/fluid/platform/dynload/warpctc.h index d157c1fda789b98f06ad069d2a9c4f421ff82dcd..18ed9956f1841874b27c2493e2f3e22fdfbf0448 100644 --- a/paddle/fluid/platform/dynload/warpctc.h +++ b/paddle/fluid/platform/dynload/warpctc.h @@ -14,10 +14,9 @@ limitations under the License. */ #pragma once -#include #include // NOLINT - #include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" #include "warpctc/include/ctc.h" namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 6c2331b75f64b777adcdca4245d503bb5a52e1a6..a76ba75f9eeb8c3f42fbf7254f629b0960a8f2d8 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,9 +14,6 @@ limitations under the License. */ #pragma once -#include // for dladdr -#include // for backtrace - #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ @@ -37,6 +34,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/to_string.h" @@ -75,7 +73,7 @@ struct EnforceNotMet : public std::exception { sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl; sout << "PaddlePaddle Call Stacks: " << std::endl; - +#if !defined(_WIN32) void* call_stack[TRACE_STACK_LIMIT]; auto size = backtrace(call_stack, TRACE_STACK_LIMIT); auto symbols = backtrace_symbols(call_stack, size); @@ -95,6 +93,9 @@ struct EnforceNotMet : public std::exception { } } free(symbols); +#else + sout << "Windows not support stack backtrace yet."; +#endif err_str_ = sout.str(); } } diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h new file mode 100644 index 0000000000000000000000000000000000000000..a0a2d29500e7afbe8a9a43f010d5fd2d0c560467 --- /dev/null +++ b/paddle/fluid/platform/port.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#if !defined(_WIN32) +#include // for dladdr +#include // for backtrace +#else +#include +#include + +static void* dlsym(void* handle, const char* symbol_name) { + FARPROC found_symbol; + found_symbol = GetProcAddress((HMODULE)handle, symbol_name); + + if (found_symbol == NULL) { + throw std::runtime_error(std::string(symbol_name) + " not found."); + } + return reinterpret_cast(found_symbol); +} + +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 89ca4f781273e99bbb83216c238dfc5c88c0a22b..d6a14b3305c5cf2d544f17f39a3812f7f75b8a76 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,19 +1,22 @@ +set(PYBIND_DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method + ) +if(NOT WIN32) +list(APPEND PYBIND_DEPS parallel_executor) +endif() if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python proto_desc memory executor prune profiler feed_fetch_method - parallel_executor + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB}) - if(NOT APPLE AND NOT ANDROID) + if(NOT APPLE AND NOT ANDROID AND NOT WIN32) target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID) + endif(NOT APPLE AND NOT ANDROID AND NOT WIN32) endif(WITH_AMD_GPU) cc_test(tensor_py_test SRCS tensor_py_test.cc DEPS python) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2377ac5f929eb21449689240da3061152a0541f9..62682d10324c7cfe656c9ddb09f1b61ac1772e69 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1363,6 +1363,13 @@ class Program(object): self._current_role = core.op_proto_and_checker_maker.OpRole.Forward self._op_role_var = [] + # for distribute + self._is_distributed = False + self._is_chief = False + self._slice_vars_and_attrs = [] + self._endpoints = [] + self._distributed_lookup_table = None + @property def op_role(self): """ diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 6b67128fbfdb10d1abde4ebe7d663a2685bff109..b3ed094c892c6fce7184d6d98f50ed7d6d1642a3 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -372,6 +372,7 @@ def load_vars(executor, load_vars( executor, dirname=dirname, + main_program=main_program, vars=list(filter(predicate, main_program.list_vars())), filename=filename) else: @@ -403,9 +404,12 @@ def load_vars(executor, inputs={}, outputs={"Out": load_var_list}, attrs={'file_path': os.path.join(dirname, filename)}) - executor.run(load_prog) + # load slice vars on pserver, if have it. + _load_slice_up_vars(executor, dirname, + main_program._slice_vars_and_attrs) + def load_params(executor, dirname, main_program=None, filename=None): """ @@ -659,11 +663,19 @@ def save_inference_model(dirname, save_persistables(executor, dirname, inference_program, params_filename) + # if there is lookup table, the trainer 0 will notify all pserver to save. + if main_program._is_distributed and main_program._is_chief and main_program._distributed_lookup_table: + lookup_table_filename = os.path.join(dirname, "__lookup_table__") + _save_lookup_tables_by_notify(executor, lookup_table_filename, + main_program._distributed_lookup_table, + main_program._endpoints) + def load_inference_model(dirname, executor, model_filename=None, - params_filename=None): + params_filename=None, + pserver_endpoints=None): """ Load inference model from a directory @@ -679,6 +691,10 @@ def load_inference_model(dirname, parameters were saved in a single binary file. If parameters were saved in separate files, set it as 'None'. + pserver_endpoints(list|None): This only need by distributed inference. + When use distributed look up table in training, + We also need it in inference.The parameter is + a list of pserver endpoints. Returns: tuple: The return of this function is a tuple with three elements: @@ -697,12 +713,16 @@ def load_inference_model(dirname, exe = fluid.Executor(fluid.CPUPlace()) path = "./infer_model" + endpoints = ["127.0.0.1:2023","127.0.0.1:2024"] [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model(dirname=path, executor=exe) results = exe.run(inference_program, feed={feed_target_names[0]: tensor_img}, fetch_list=fetch_targets) + # if we need lookup table, we will use: + fluid.io.load_inference_model(dirname=path, executor=exe, pserver_endpoints=endpoints) + # In this exsample, the inference program was saved in the # "./infer_model/__model__" and parameters were saved in # separate files in ""./infer_model". @@ -729,6 +749,9 @@ def load_inference_model(dirname, program = Program.parse_from_string(program_desc_str) load_persistables(executor, dirname, program, params_filename) + if pserver_endpoints: + program = _endpoints_replacement(program, pserver_endpoints) + feed_target_names = program.desc.get_feed_target_names() fetch_target_names = program.desc.get_fetch_target_names() fetch_targets = [ @@ -738,6 +761,61 @@ def load_inference_model(dirname, return [program, feed_target_names, fetch_targets] +def _save_lookup_tables_by_notify(executor, dirname, lookup_table, + pserver_endpoints): + """ + This function will send checkpoint notify message from Trainer 0 + to all the pservers. + The checkpoint notify message contains lookup table name, + the absolute path on pserver to save lookup_table. + + Args: + executor(Executor): The executor to run for send checkpoint notify. + dirname(str): The folder where to save. + lookup_table(string): the lookup table name, when use distribute + lookup table, we can get lookup table name by DistributeTranspiler. + table_name + ps_endpoint_list(list): the parameter server ip:port list. + when use distribute lookup table, we can get ps_endpoint_list by + distribute arguments. + Return: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + table_name = "share_w" + ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] + + _save_pserver_vars_by_notify(executor=exe, + dirname=param_path, lookup_table=table_name, + pserver_endpoints=ps_endpoints) + """ + + pserver_notify_program = Program() + pserver_notify_block = pserver_notify_program.global_block() + + attrs = {} + attrs['epmap'] = pserver_endpoints + attrs['dir'] = dirname + attrs['lookup_table'] = lookup_table + + pserver_notify_block.append_op( + type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) + executor.run(pserver_notify_program) + + +def _endpoints_replacement(program, endpoints): + ENDPOINT_MAP = "epmap" + for op in program.global_block().ops: + if op.has_attr(ENDPOINT_MAP): + op.set_attr(ENDPOINT_MAP, endpoints) + program._sync_with_cpp() + return program + + def get_parameter_value(para, executor): """ Get the LoDTensor value of the given parameter. @@ -799,3 +877,46 @@ def get_parameter_value_by_name(name, executor, program=None): program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) + + +def _load_slice_up_vars(executor, dirname, slice_vars_and_attrs): + if not slice_vars_and_attrs: + return + + load_prog = Program() + load_block = load_prog.global_block() + + for var_tuple in slice_vars_and_attrs: + orig_var = var_tuple[0] + start = var_tuple[1] + slice_var = var_tuple[2] + end = start + reduce(lambda x, y: x * y, slice_var.shape) + + clone_orig_var = load_block.create_var( + name=orig_var.name, + type=orig_var.type, + shape=orig_var.shape, + dtype=orig_var.dtype, + persistable=True) + + clone_slice_var = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [clone_orig_var]}, + attrs={'file_path': os.path.join(dirname, clone_orig_var.name)}) + load_block.append_op( + type="slice", + inputs={'Input': clone_orig_var}, + outputs={'Out': clone_slice_var}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + executor.run(load_prog) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bd2b950cffe646128501772899fea2e09d9bdce3..71592618f540a8f42d9a25dd8a1af5e67a592f21 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -94,6 +94,7 @@ __all__ = [ 'image_resize_short', 'resize_bilinear', 'gather', + 'scatter', 'random_crop', 'mean_iou', 'relu', @@ -5036,6 +5037,47 @@ def gather(input, index): return out +def scatter(input, index, updates, name=None): + """ + **Scatter Layer** + + Output is obtained by updating the input on selected indices on the first + axis. + + .. math:: + + Out = X + Out[Ids] = Updates + + Args: + input (Variable): The source input with rank>=1. + index (Variable): The index input with rank=1. Its dtype should be + int32 or int64 as it is used as indexes. + updates (Variable): The updated value of scatter op. + name (str|None): The output variable name. Default None. + + Returns: + output (Variable): The output is a tensor with the same shape as input. + + Examples: + + .. code-block:: python + + output = fluid.layers.scatter(input, index, updates) + + """ + helper = LayerHelper('scatter', **locals()) + dtype = helper.input_dtype() + out = helper.create_tmp_variable(dtype) + helper.append_op( + type="scatter", + inputs={"X": input, + "Ids": index, + "Updates": updates}, + outputs={"Out": out}) + return out + + @templatedoc() def random_crop(x, shape, seed=None): """ diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index cc4a7de163ac2e52e43131a021160051423e040c..7cd62efda8900c830f43d882a41ab03184ebe594 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -65,7 +65,6 @@ __all__ = [ 'uniform_random_batch_size_like', 'gaussian_random', 'gaussian_random_batch_size_like', - 'scatter', 'sum', 'slice', 'shape', diff --git a/python/paddle/fluid/tests/unittests/dist_mnist.py b/python/paddle/fluid/tests/unittests/dist_mnist.py index 722b3e159abf4737b2bb43c7b84e23a3618cda12..85a96c0b53f6bc08687965048d6251265055a6fe 100644 --- a/python/paddle/fluid/tests/unittests/dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/dist_mnist.py @@ -46,7 +46,8 @@ def cnn_model(data): pool_size=2, pool_stride=2, act="relu", - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.3))) conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, @@ -54,7 +55,8 @@ def cnn_model(data): pool_size=2, pool_stride=2, act="relu", - param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant())) + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Constant( + value=0.2))) SIZE = 10 input_shape = conv_pool_2.shape @@ -66,8 +68,7 @@ def cnn_model(data): size=SIZE, act="softmax", param_attr=fluid.param_attr.ParamAttr( - initializer=fluid.initializer.NormalInitializer( - loc=0.0, scale=scale, seed=1))) + initializer=fluid.initializer.Constant(value=0.1))) return predict diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 1307ba4e4ad11ef01094c44068d916ff2d442f78..0387e911880256ea6b8efb6f2311bbf4c4f8c0f2 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -129,7 +129,12 @@ class SE_ResNeXt(): input=conv, pool_size=7, pool_type='avg', global_pooling=True) drop = fluid.layers.dropout(x=pool, dropout_prob=0.2) stdv = 1.0 / math.sqrt(drop.shape[1] * 1.0) - out = fluid.layers.fc(input=drop, size=class_dim, act='softmax') + out = fluid.layers.fc( + input=drop, + size=class_dim, + act='softmax', + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0.2))) return out def shortcut(self, input, ch_out, stride): @@ -179,7 +184,7 @@ class SE_ResNeXt(): act=None, # avoid pserver CPU init differs from GPU param_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant()), + initializer=fluid.initializer.Constant(value=0.2)), bias_attr=False) return fluid.layers.batch_norm(input=conv, act=act) @@ -228,10 +233,8 @@ class DistSeResneXt2x2(TestDistRunnerBase): lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.Momentum( - # FIXME(typhoonzero): add back LR decay once ParallelExecutor fixed. - #learning_rate=fluid.layers.piecewise_decay( - # boundaries=bd, values=lr), - learning_rate=base_lr, + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) optimizer.minimize(avg_cost) diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index ab4c5c3f368333ac42781aebd495579b5c26f388..239adcb9d5900d4073a6c07cb189ab7503aea86e 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -265,9 +265,9 @@ def main(role="pserver", if __name__ == "__main__": - if len(sys.argv) != 7: + if len(sys.argv) != 8: print( - "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + "Usage: python dist_transformer.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" ) role = sys.argv[1] endpoints = sys.argv[2] @@ -275,6 +275,8 @@ if __name__ == "__main__": current_endpoint = sys.argv[4] trainers = int(sys.argv[5]) is_dist = True if sys.argv[6] == "TRUE" else False + # FIXME(typhoonzero): refine this test. + is_async = True if sys.argv[7] == "TRUE" else False main( role=role, endpoints=endpoints, diff --git a/python/paddle/fluid/tests/unittests/test_desc_clone.py b/python/paddle/fluid/tests/unittests/test_desc_clone.py index 88d44e453c7976f5e0fbda2c0871dfabd4bb30aa..fa6b67956259f33b109758c5939ab5729482695a 100644 --- a/python/paddle/fluid/tests/unittests/test_desc_clone.py +++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py @@ -27,6 +27,7 @@ import unittest from multiprocessing import Process import os import signal +import six import collections SEED = 1 @@ -55,7 +56,8 @@ def cnn_model(data): # TODO(dzhwinter) : refine the initializer and random seed settting SIZE = 10 input_shape = conv_pool_2.shape - param_shape = [reduce(lambda a, b: a * b, input_shape[1:], 1)] + [SIZE] + param_shape = [six.moves.reduce(lambda a, b: a * b, input_shape[1:], 1) + ] + [SIZE] scale = (2.0 / (param_shape[0]**2 * SIZE))**0.5 predict = fluid.layers.fc( @@ -108,7 +110,7 @@ def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): def operator_equal(a, b): - for k, v in a.__dict__.iteritems(): + for k, v in six.iteritems(a.__dict__): if isinstance(v, fluid.framework.Program) or \ isinstance(v, fluid.framework.Block): continue @@ -118,8 +120,8 @@ def operator_equal(a, b): raise ValueError("In operator_equal not equal:{0}\n".format(k)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(v.iteritems(), key=lambda x: x[0]) - v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + v0 = sorted(six.iteritems(v), key=lambda x: x[0]) + v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) if v0 != v1: raise ValueError("In operator_equal not equal:{0}\n".format(k)) @@ -131,7 +133,7 @@ def operator_equal(a, b): def block_equal(a, b): - for k, v in a.__dict__.iteritems(): + for k, v in six.iteritems(a.__dict__): if isinstance(v, core.ProgramDesc) or isinstance( v, fluid.framework.Program) or isinstance(v, core.BlockDesc): continue @@ -143,8 +145,8 @@ def block_equal(a, b): assert (len(a.ops) == len(b.ops)) elif isinstance(v, collections.OrderedDict): - v0 = sorted(v.iteritems(), key=lambda x: x[0]) - v1 = sorted(b.__dict__[k].iteritems(), key=lambda x: x[0]) + v0 = sorted(six.iteritems(v), key=lambda x: x[0]) + v1 = sorted(six.iteritems(b.__dict__[k]), key=lambda x: x[0]) if v0 != v1: raise ValueError("In block_equal not equal:{0}\n".format(k)) @@ -156,7 +158,7 @@ def block_equal(a, b): def program_equal(a, b): - for k, v in a.__dict__.iteritems(): + for k, v in six.iteritems(a.__dict__): if isinstance(v, core.ProgramDesc): continue diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 4c71181d0d736bd1a8796b2d38ed1667557e3db8..0e815c91446b285ba2c2c5aa9ad18d97f51eae65 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -30,7 +30,7 @@ class TestDistRunnerBase(object): "get_model should be implemented by child classes.") def get_transpiler(self, trainer_id, main_program, pserver_endpoints, - trainers): + trainers, sync_mode): # NOTE: import fluid until runtime, or else forking processes will cause error. import paddle import paddle.fluid as fluid @@ -39,17 +39,22 @@ class TestDistRunnerBase(object): trainer_id=trainer_id, program=main_program, pservers=pserver_endpoints, - trainers=trainers) + trainers=trainers, + sync_mode=sync_mode) return t - def run_pserver(self, pserver_endpoints, trainers, current_endpoint, - trainer_id): + def run_pserver(self, + pserver_endpoints, + trainers, + current_endpoint, + trainer_id, + sync_mode=True): import paddle import paddle.fluid as fluid self.get_model(batch_size=2) t = self.get_transpiler(trainer_id, fluid.default_main_program(), pserver_endpoints, - trainers) + trainers, sync_mode) pserver_prog = t.get_pserver_program(current_endpoint) startup_prog = t.get_startup_program(current_endpoint, pserver_prog) place = fluid.CPUPlace() @@ -57,7 +62,13 @@ class TestDistRunnerBase(object): exe.run(startup_prog) exe.run(pserver_prog) - def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): + def run_trainer(self, + place, + endpoints, + trainer_id, + trainers, + is_dist=True, + sync_mode=True): import paddle import paddle.fluid as fluid test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \ @@ -65,7 +76,7 @@ class TestDistRunnerBase(object): if is_dist: t = self.get_transpiler(trainer_id, fluid.default_main_program(), endpoints, - trainers) + trainers, sync_mode) trainer_prog = t.get_trainer_program() else: trainer_prog = fluid.default_main_program() @@ -106,9 +117,9 @@ def runtime_main(test_class): import paddle.fluid as fluid import paddle.fluid.core as core - if len(sys.argv) != 7: + if len(sys.argv) != 8: print( - "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist]" + "Usage: python dist_se_resnext.py [pserver/trainer] [endpoints] [trainer_id] [current_endpoint] [trainers] [is_dist] [sync_mode]" ) role = sys.argv[1] endpoints = sys.argv[2] @@ -116,34 +127,43 @@ def runtime_main(test_class): current_endpoint = sys.argv[4] trainers = int(sys.argv[5]) is_dist = True if sys.argv[6] == "TRUE" else False + sync_mode = True if sys.argv[7] == "TRUE" else False model = test_class() if role == "pserver": - model.run_pserver(endpoints, trainers, current_endpoint, trainer_id) + model.run_pserver(endpoints, trainers, current_endpoint, trainer_id, + sync_mode) else: p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda( ) else fluid.CPUPlace() - model.run_trainer(p, endpoints, trainer_id, trainers, is_dist) + model.run_trainer(p, endpoints, trainer_id, trainers, is_dist, + sync_mode) import paddle.compat as cpt class TestDistBase(unittest.TestCase): + def _setup_config(self): + raise NotImplementedError("tests should have _setup_config implemented") + def setUp(self): self._trainers = 2 self._pservers = 2 self._ps_endpoints = "127.0.0.1:9123,127.0.0.1:9124" self._python_interp = "python" + self._sync_mode = True + self._setup_config() def start_pserver(self, model_file, check_error_log): + sync_mode_str = "TRUE" if self._sync_mode else "FALSE" ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps0_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + ps0_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) - ps1_cmd = "%s %s pserver %s 0 %s %d TRUE" % \ + self._trainers, sync_mode_str) + ps1_cmd = "%s %s pserver %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + self._trainers, sync_mode_str) ps0_pipe = subprocess.PIPE ps1_pipe = subprocess.PIPE @@ -195,9 +215,10 @@ class TestDistBase(unittest.TestCase): # Run local to get a base line env_local = {"CUDA_VISIBLE_DEVICES": "0"} env_local.update(required_envs) - local_cmd = "%s %s trainer %s 0 %s %d FLASE" % \ + sync_mode_str = "TRUE" if self._sync_mode else "FALSE" + local_cmd = "%s %s trainer %s 0 %s %d FLASE %s" % \ (self._python_interp, model_file, - "127.0.0.1:1234", "127.0.0.1:1234", 1) + "127.0.0.1:1234", "127.0.0.1:1234", 1, sync_mode_str) if not check_error_log: local_proc = subprocess.Popen( local_cmd.split(" "), @@ -226,12 +247,12 @@ class TestDistBase(unittest.TestCase): self._wait_ps_ready(ps1.pid) ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr0_cmd = "%s %s trainer %s 0 %s %d TRUE" % \ + tr0_cmd = "%s %s trainer %s 0 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, - self._trainers) - tr1_cmd = "%s %s trainer %s 1 %s %d TRUE" % \ + self._trainers, sync_mode_str) + tr1_cmd = "%s %s trainer %s 1 %s %d TRUE %s" % \ (self._python_interp, model_file, self._ps_endpoints, ps1_ep, - self._trainers) + self._trainers, sync_mode_str) env0 = {"CUDA_VISIBLE_DEVICES": "0"} env1 = {"CUDA_VISIBLE_DEVICES": "1"} diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py index 4ec68d411b0f0e9ae89b107914e8fd844a19228b..36bab6f04603b7ad3218603489eead859bfcb5b6 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -17,10 +17,21 @@ import unittest from test_dist_base import TestDistBase -class TestDistSeResneXt2x2(TestDistBase): +class TestDistMnist2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_mnist.py", delta=1e-7) +class TestDistMnistAsync(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_mnist.py", delta=200) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index 16525f6fdb60a90a44a628fb0648f4130218c102..c0e9fa38e7d1eadd89eff9a8ba4442f888b8120e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -18,9 +18,20 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_se_resnext.py", delta=1e-7) +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_se_resnext.py", delta=100) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transformer.py b/python/paddle/fluid/tests/unittests/test_dist_transformer.py index 313207ff9ce054f81322224cb6ceafaaf25bbedf..62fcf5953f93637a20beed649de21476a8673419 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py @@ -19,6 +19,9 @@ from test_dist_base import TestDistBase class TestDistTransformer2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_transformer(self): # TODO(paddle-dev): check if the delta is OK. # Usually start around ~8000 and converge to ~5000 diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 03083c9960636886ffa49137b9e9d4a71fbd72fe..9f04d290f7596a60d5fdfa66cbc4beec1c3fe93d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -47,7 +47,6 @@ class TranspilerTest(unittest.TestCase): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return def get_main_program(self): main = fluid.Program() @@ -95,8 +94,9 @@ class TranspilerTest(unittest.TestCase): def test_transpiler(self): main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - self.transpiler_test_impl() + with fluid.unique_name.guard(): + with fluid.program_guard(main, startup): + self.transpiler_test_impl() class TestBasicModel(TranspilerTest): @@ -249,7 +249,6 @@ class TestLRDecay(TranspilerTest): decay_rate=0.1, staircase=True)) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -279,7 +278,6 @@ class TestLRDecayConditional(TranspilerTest): learning_rate=fluid.layers.piecewise_decay([10000, 20000], [1.0, 0.5, 1.0])) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -328,7 +326,6 @@ class TestL2Decay(TranspilerTest): avg_cost = fluid.layers.mean(cost) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -363,7 +360,6 @@ class TestL2DecayWithPiecewise(TranspilerTest): momentum=0.9, regularization=fluid.regularizer.L2Decay(1e-4)) sgd_optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -393,13 +389,14 @@ class TestDistLookupTableBase(TranspilerTest): def network_with_table(self, is_sparse, is_distributed): self.table_size = 1000 self.emb_size = 64 + self.lookup_table_name = 'shared_w' def emb_pool(ids): emb = fluid.layers.embedding( input=ids, size=[self.table_size, self.emb_size], dtype='float32', - param_attr='shared_w', # share parameter + param_attr=self.lookup_table_name, # share parameter is_sparse=is_sparse, is_distributed=is_distributed) pool = fluid.layers.sequence_pool(input=emb, pool_type='average') @@ -572,7 +569,7 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase): def transpiler_test_impl(self): config = fluid.DistributeTranspilerConfig() - pserver1, startup1 = self.get_pserver(self.pserver1_ep, config) + pserver1, _ = self.get_pserver(self.pserver1_ep, config) self.assertTrue(self.transpiler.has_distributed_lookup_table) lookup_table_var = pserver1.global_block().vars[ @@ -582,6 +579,21 @@ class TestDistLookupTableSliceSize(TestDistLookupTableBase): self.assertEqual(row_size, calc_row_size) +class TestDistArgsInProgram(TestDistLookupTableBase): + def net_conf(self): + self.network_with_table(is_sparse=True, is_distributed=True) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + self.assertTrue(trainer._is_distributed) + self.assertTrue(trainer._is_chief) + self.assertEqual(trainer._distributed_lookup_table, + self.lookup_table_name) + self.assertEqual(trainer._endpoints, + [self.pserver1_ep, self.pserver2_ep]) + + class TestRMSPropOptimizer(TranspilerTest): def net_conf(self): x = fluid.layers.data(name='x', shape=[1000], dtype='float32') @@ -595,7 +607,6 @@ class TestRMSPropOptimizer(TranspilerTest): avg_cost = fluid.layers.mean(cost) optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) optimizer.minimize(avg_cost) - return def transpiler_test_impl(self): pserver, startup = self.get_pserver(self.pserver1_ep) @@ -612,5 +623,40 @@ class TestRMSPropOptimizer(TranspilerTest): self.assertEqual(moment_var.shape, (500, 1000)) +class TestLoadSliceVar(TranspilerTest): + def net_conf(self): + x = fluid.layers.data(name='x', shape=[1000], dtype='float32') + y_predict = fluid.layers.fc(input=x, + size=1000, + act=None, + param_attr=fluid.ParamAttr(name='fc_w'), + bias_attr=fluid.ParamAttr(name='fc_b')) + y = fluid.layers.data(name='y', shape=[1], dtype='float32') + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_cost = fluid.layers.mean(cost) + optimizer = fluid.optimizer.RMSProp(learning_rate=0.1) + optimizer.minimize(avg_cost) + + def transpiler_test_impl(self): + pserver, _ = self.get_pserver(self.pserver1_ep) + pserver2, _ = self.get_pserver(self.pserver2_ep) + + self.assertTrue(pserver._slice_vars_and_attrs) + self.assertTrue(pserver2._slice_vars_and_attrs) + + for idx in xrange(len(pserver._slice_vars_and_attrs)): + self.assertEqual(pserver._slice_vars_and_attrs[idx][0], + pserver2._slice_vars_and_attrs[idx][0]) + + total_numel = reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][0].shape) + self.assertEqual( + total_numel, + reduce(lambda x, y: x * y, + pserver._slice_vars_and_attrs[idx][2].shape) + reduce( + lambda x, y: x * y, + pserver2._slice_vars_and_attrs[idx][2].shape)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py index e43992c488d35d1b3f670e13650d420b0498eeec..38af149ad336fcb818c3cbc9c686bcbdf00238be 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -18,9 +18,20 @@ from test_dist_base import TestDistBase class TestDistSeResneXt2x2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + def test_se_resnext(self): self.check_with_place("dist_word2vec.py", delta=1e-7) +class TestDistSeResneXt2x2Async(TestDistBase): + def _setup_config(self): + self._sync_mode = False + + def test_se_resnext(self): + self.check_with_place("dist_word2vec.py", delta=1) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e833a7db482db3eb69ef8596c468c535dee02370..8e707c8b00b7bf3c5ea77c18c18135e89ffab9c7 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -347,6 +347,25 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(loss) print(str(program)) + def test_scatter(self): + program = Program() + with program_guard(program): + x = layers.data( + name='x', + shape=[3, 3], + append_batch_size=False, + dtype='float32') + idx = layers.data( + name='idx', shape=[2], append_batch_size=False, dtype='int32') + updates = layers.data( + name='updates', + shape=[2, 3], + append_batch_size=False, + dtype='float32') + out = layers.scatter(input=x, index=idx, updates=updates) + self.assertIsNotNone(out) + print(str(program)) + def test_lod_reset(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 57bc2e8a0ba173bb1273a5183340d0b618f0d73c..540eb8c8339981dd727a001c048358895e7b951e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -215,6 +215,13 @@ class DistributeTranspiler(object): for param_var, grad_var in self.params_grads: self.param_name_to_grad_name[param_var.name] = grad_var.name + # add distributed attrs to program + self.origin_program._is_distributed = True + self.origin_program._endpoints = self.pserver_endpoints + self.origin_program._is_chief = self.trainer_id == 0 + self.origin_program._distributed_lookup_table = self.table_name if self.table_name else None + + # split and create vars, then put splited vars in dicts for later use. # step 1: split and create vars, then put splited vars in dicts for later use. self._init_splited_vars() @@ -369,7 +376,7 @@ class DistributeTranspiler(object): # FIXME(gongwb): delete not need ops. # note that: some parameter is not trainable and those ops can't be deleted. - for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in six.iteritems(self.param_var_mapping): # Get the eplist of recv vars eps = [] for var in splited_var: @@ -406,7 +413,7 @@ class DistributeTranspiler(object): RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) - for varname, splited_var in self.param_var_mapping.iteritems(): + for varname, splited_var in six.iteritems(self.param_var_mapping): #add concat ops to merge splited parameters received from parameter servers. if len(splited_var) <= 1: continue @@ -590,6 +597,8 @@ class DistributeTranspiler(object): checkpoint_block_id = self._create_checkpoint_save_block( pserver_program, table_opt_block.idx) + pserver_program._distributed_lookup_table = self.table_name + # NOTE: if has_distributed_lookup_table is False, then prefetch_block will # not be executed, so it's safe to use optimize_block to hold the place if self.has_distributed_lookup_table: @@ -616,6 +625,10 @@ class DistributeTranspiler(object): outputs={}, attrs=attrs) + # add distributed attrs + pserver_program._slice_vars_and_attrs = self._get_slice_vars_and_attrs( + endpoint) + pserver_program._sync_with_cpp() return pserver_program @@ -689,8 +702,31 @@ class DistributeTranspiler(object): inputs=new_inputs, outputs=new_outputs, attrs=op.all_attrs()) + + # add slice vars + s_prog._slice_vars_and_attrs = self._get_slice_vars_and_attrs(endpoint) + return s_prog + def _get_slice_vars_and_attrs(self, endpoint): + slice_vars_and_attrs = [] + block_suffix = "block" + for param in self.param_grad_ep_mapping[endpoint]["params"]: + orig_var_name, block_name, _ = self._get_varname_parts(param.name) + if not block_name: + continue + + block_idx = int(block_name.split(block_suffix)[1]) + orig_var = self.origin_program.global_block().vars[orig_var_name] + + skip_numel = 0 + slice_vars = self.param_var_mapping[orig_var_name] + for slice_var in slice_vars[:block_idx]: + skip_numel += reduce(lambda x, y: x * y, slice_var.shape) + slice_vars_and_attrs.append([orig_var, skip_numel, param]) + + return slice_vars_and_attrs + # ====================== private transpiler functions ===================== def _has_distributed_lookup_table(self): @@ -1209,8 +1245,8 @@ class DistributeTranspiler(object): elif op_type == "momentum": if varkey == "Velocity": return param_shape - elif op_type == "": - if varkey == "Moment": + elif op_type == "rmsprop": + if varkey in ["Moment", "MeanSquare"]: return param_shape elif op_type == "sgd": pass @@ -1289,8 +1325,6 @@ class DistributeTranspiler(object): pserver_block = program.global_block() new_inputs = collections.OrderedDict() - # update param/grad shape first, then other inputs like - # moment can use the updated shape def _get_param_block(opt_op): # param is already created on global program param_block = None @@ -1303,22 +1337,6 @@ class DistributeTranspiler(object): for key in opt_op.input_names: if key == "Grad": new_inputs[key] = merged_var - # For RMSProp optimizer - elif key == "Moment" or key == "MeanSquare": - param_block = _get_param_block(opt_op) - if not param_block: - return - moment_var = origin_program.global_block().vars[opt_op.input( - key)[0]] - tmpvar = pserver_block.create_var( - name=moment_var.name, - persistable=moment_var.persistable, - dtype=moment_var.dtype, - # change to use same shape as param - # TODO(typhoonzero): didn't append .block in the var name, - # may affect checkpoint saving? Need to verify. - shape=param_block.shape) - new_inputs[key] = tmpvar elif key == "Param": param_block = _get_param_block(opt_op) if not param_block: @@ -1346,7 +1364,7 @@ class DistributeTranspiler(object): for key in opt_op.input_names: new_shape = None - if key in ["Param", "Grad", "LearningRate", "Moment", "MeanSquare"]: + if key in ["Param", "Grad", "LearningRate"]: continue var = self.origin_program.global_block().vars[opt_op.input(key)[0]] # update accumulator variable shape diff --git a/python/setup.py.in b/python/setup.py.in index 4a6cddbbea4903f5a65123aa19b7e978b335f32b..786c9f2e39880b68700b8acb94b3d35a48323958 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -159,18 +159,20 @@ if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_LIB}', libs_path) shutil.copy('${MKLML_IOMP_LIB}', libs_path) package_data['paddle.libs']+=['libmklml_intel.so','libiomp5.so'] -if '${WITH_MKLDNN}' == 'ON': - # TODO(typhoonzero): use install_name_tool to patch mkl libs once - # we can support mkl on mac. - # - # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. - # The reason is that all thirdparty libraries in the same directory, - # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. - command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" - if os.system(command) != 0: - raise Exception("patch libmkldnn.so failed, command: %s" % command) - package_data['paddle.libs']+=['libmkldnn.so.0'] - shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode. + if '${WITH_MKLDNN}' == 'ON': + # TODO(typhoonzero): use install_name_tool to patch mkl libs once + # we can support mkl on mac. + # + # change rpath of libmkldnn.so.0, add $ORIGIN/ to it. + # The reason is that all thirdparty libraries in the same directory, + # thus, libmkldnn.so.0 will find libmklml_intel.so and libiomp5.so. + command = "patchelf --set-rpath '$ORIGIN/' ${MKLDNN_SHARED_LIB}" + if os.system(command) != 0: + raise Exception("patch libmkldnn.so failed, command: %s" % command) + package_data['paddle.libs']+=['libmkldnn.so.0'] + shutil.copy('${MKLDNN_SHARED_LIB}', libs_path) # remove unused paddle/libs/__init__.py os.remove(libs_path+'/__init__.py') package_dir['paddle.libs']=libs_path @@ -179,20 +181,22 @@ package_dir['paddle.libs']=libs_path # The reason is that libwarpctc.so, libiomp5.so etc are in paddle.libs, and # core.so is in paddle.fluid, thus paddle/fluid/../libs will pointer to above libraries. # This operation will fix https://github.com/PaddlePaddle/Paddle/issues/3213 -if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" -else: - command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" -if os.system(command) != 0: - raise Exception("patch core.so failed, command: %s" % command) -if '${WITH_FLUID_ONLY}'== 'OFF': - # change rpath of _swig_paddle.so. +if '${CMAKE_BUILD_TYPE}' == 'Release': + # only change rpath in Release mode, since in Debug mode, core.so is too large to be changed. if "@APPLE@" == "1": - command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" else: - command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so" if os.system(command) != 0: - raise Exception("patch _swig_paddle.so failed, command: %s" % command) + raise Exception("patch core.so failed, command: %s" % command) + if '${WITH_FLUID_ONLY}'== 'OFF': + # change rpath of _swig_paddle.so. + if "@APPLE@" == "1": + command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + else: + command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so" + if os.system(command) != 0: + raise Exception("patch _swig_paddle.so failed, command: %s" % command) setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}',