diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 0527310974fd22e62689d254a29ad5e46ba98566..f10dc860831cb89de3d9d48eef146b37f2f0db26 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -153,6 +153,13 @@ paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_ paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=['input', 'shape', 'dtype', 'input_dim_idx', 'output_dim_idx', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', 0, 0, -1.0, 1.0, 0)) +paddle.fluid.layers.gaussian_random ArgSpec(args=['shape', 'mean', 'std', 'seed', 'dtype', 'use_mkldnn'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32', False)) +paddle.fluid.layers.sampling_id ArgSpec(args=['x', 'min', 'max', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=['input', 'shape', 'input_dim_idx', 'output_dim_idx', 'mean', 'std', 'seed', 'dtype'], varargs=None, keywords=None, defaults=(0, 0, 0.0, 1.0, 0, 'float32')) +paddle.fluid.layers.sum ArgSpec(args=['x', 'use_mkldnn'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.layers.slice ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None) +paddle.fluid.layers.shape ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -224,13 +231,6 @@ paddle.fluid.layers.logical_and ArgSpec(args=[], varargs='args', keywords='kwarg paddle.fluid.layers.logical_or ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_xor ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.logical_not ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.uniform_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sampling_id ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.gaussian_random_batch_size_like ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6d8cbe5d9e491555a94e9420995149041213ab79..69c6dd02005902d9ee486a0b7f8f0c9a343be255 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -150,11 +150,10 @@ else() endif() if (NOT WIN32) - cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor - graph graph_viz_pass multi_devices_graph_pass - multi_devices_graph_print_pass multi_devices_graph_check_pass - fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + graph build_strategy + fast_threaded_ssa_graph_executor) endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a8e0c4a3fedfd56e38de7568be6b3f2e76a4b25f..e0a3ef5a9c6c53c42ebea1a41cac0d18a77781b2 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu # device_context reduce_op_handle ) cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) + +cc_library(build_strategy SRCS build_strategy.cc DEPS + graph_viz_pass multi_devices_graph_pass + multi_devices_graph_print_pass multi_devices_graph_check_pass + fuse_elewise_add_act_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a6b497fa897e3882995688bf36704b1d77ea962 --- /dev/null +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -0,0 +1,126 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/details/build_strategy.h" + +#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelExecutorPassBuilder : public ir::PassBuilder { + public: + explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy) + : ir::PassBuilder(), strategy_(strategy) { + // Add a graph viz pass to record a graph. + if (!strategy_.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + } + + // Add op fusion. + if (strategy.fuse_elewise_add_act_ops_) { + auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass"); + // Add a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = AppendPass("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + viz_pass->Set("graph_viz_path", + new std::string(graph_path)); + } + } + + // Convert graph to run on multi-devices. + auto multi_devices_pass = AppendPass("multi_devices_pass"); + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + + // Add a graph print pass to record a graph with device info. + if (!strategy_.debug_graphviz_path_.empty()) { + auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); + multi_devices_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy_.debug_graphviz_path_); + multi_devices_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + } + + // Verify that the graph is correct for multi-device executor. + AppendPass("multi_devices_check_pass"); + } + + private: + BuildStrategy strategy_; +}; + +std::shared_ptr BuildStrategy::CreatePassesFromStrategy() + const { + pass_builder_.reset(new ParallelExecutorPassBuilder(*this)); + return pass_builder_; +} + +std::unique_ptr BuildStrategy::Apply( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, +#ifdef PADDLE_WITH_CUDA + const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { +#else + const bool use_cuda) const { +#endif + // Create a default one if not initialized by user. + if (!pass_builder_) { + CreatePassesFromStrategy(); + } + + std::unique_ptr graph(new ir::Graph(main_program)); + + for (std::shared_ptr &pass : pass_builder_->AllPasses()) { + if (pass->Type() == "multi_devices_pass") { + pass->Erase("places"); + pass->SetNotOwned>("places", &places); + pass->Erase("loss_var_name"); + pass->SetNotOwned("loss_var_name", &loss_var_name); + pass->Erase("params"); + pass->SetNotOwned>("params", + ¶m_names); + pass->Erase("local_scopes"); + pass->SetNotOwned>("local_scopes", + &local_scopes); +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase("nccl_ctxs"); + pass->SetNotOwned("nccl_ctxs", nctx); +#endif + } + graph = pass->Apply(std::move(graph)); + } + return graph; +} +} // namespace details +} // namespace framework +} // namespace paddle + +USE_PASS(fuse_elewise_add_act_pass); +USE_PASS(graph_viz_pass); +USE_PASS(multi_devices_pass); +USE_PASS(multi_devices_check_pass); +USE_PASS(multi_devices_print_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 77cafa49f18158ea4187908856e10adaa0f916c9..02c4bea16916d58a6d0fce8918f8fceb9ff9356e 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -15,6 +15,17 @@ #pragma once #include +#include + +#include "paddle/fluid/framework/ir/pass_builder.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/nccl_helper.h" +#endif namespace paddle { namespace framework { @@ -57,6 +68,30 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; bool enable_data_balance_{false}; + + // User normally doesn't need to call this API. + // The PassBuilder allows for more customized insert, remove of passes + // from python side. + // A new PassBuilder is created based on configs defined above and + // passes are owned by the PassBuilder. + std::shared_ptr CreatePassesFromStrategy() const; + + // Apply the passes built by the pass_builder_. The passes will be + // applied to the Program and output an ir::Graph. + std::unique_ptr Apply( + const ProgramDesc &main_program, + const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, +#ifdef PADDLE_WITH_CUDA + const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; +#else + const bool use_cuda) const; +#endif + + private: + mutable std::shared_ptr pass_builder_; }; } // namespace details diff --git a/paddle/fluid/framework/details/cow_ptr.h b/paddle/fluid/framework/details/cow_ptr.h index 21f75957be5f33f3dfc09c41fa9a1e1ca590f99e..090517ff3c1822c2e62e61fad05d49e1c8db8573 100644 --- a/paddle/fluid/framework/details/cow_ptr.h +++ b/paddle/fluid/framework/details/cow_ptr.h @@ -20,79 +20,37 @@ namespace paddle { namespace framework { namespace details { -// Change it to thread safe flags if needed. -class ThreadUnsafeOwnershipFlags { +template +class COWPtr { public: - explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {} - - ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags& operator=( - const ThreadUnsafeOwnershipFlags& other) = delete; - ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default; - - void SetOwnership(bool flag) { flag_ = flag; } - - // Invoke the callback if it is not owned. - template - void AcquireOwnershipOnce(Callback acquire) { - if (!flag_) { - acquire(); - flag_ = true; - } - } + typedef std::shared_ptr RefPtr; private: - bool flag_; -}; + RefPtr m_sp; -// Copy-On-Write pointer. -// It will hold a T* pointer, and only copy once when `MutableData` is invoked. -// -// The template parameter OwnershipFlags should have: -// * a constructor takes a bool. True if own. -// * SetOwnership(bool flag). -// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not -// owned. -// -// https://en.wikipedia.org/wiki/Copy-on-write -template -class COWPtr { public: - // Ctor from raw pointer. - explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {} + COWPtr() : m_sp(nullptr) {} + explicit COWPtr(T* t) : m_sp(t) {} - // Move methods. Steal ownership from origin - COWPtr(COWPtr&& other) - : payload_(other.payload_), ownership_{std::move(other.ownership_)} {} - COWPtr& operator=(COWPtr&& origin) = default; + const T& Data() const { return *m_sp; } - // Copy methods. Not own payload - COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {} - COWPtr& operator=(const COWPtr& other) { - payload_ = other.payload_; - ownership_.SetOwnership(false); - return *this; - } - - // Access read only data. - const T& Data() const { return *payload_; } - - // Access mutable data. If the data is not owned, the data will be copied - // before. T* MutableData() { - ownership_.AcquireOwnershipOnce( - [this] { payload_.reset(new T(*payload_)); }); - return payload_.get(); + DetachIfNotUnique(); + return m_sp.get(); } - private: - // Actual data pointer. - std::shared_ptr payload_; + void DetachIfNotUnique() { + T* tmp = m_sp.get(); + if (!(tmp == nullptr || m_sp.unique())) { + Detach(); + } + } - // Ownership flag. - OwnershipFlags ownership_; + void Detach() { + T* tmp = m_sp.get(); + m_sp = RefPtr(new T(*tmp)); + } }; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/cow_ptr_test.cc b/paddle/fluid/framework/details/cow_ptr_test.cc index d2142af277c0b356d83941b3baab1947cce31dac..5b055d7cb4d127dc20f2cf70869134f24a93d429 100644 --- a/paddle/fluid/framework/details/cow_ptr_test.cc +++ b/paddle/fluid/framework/details/cow_ptr_test.cc @@ -30,6 +30,14 @@ TEST(COWPtr, all) { ASSERT_EQ(ptr2.Data(), 10); } +TEST(COWPtr, change_old) { + COWPtr ptr(new int{0}); + COWPtr ptr2 = ptr; + *ptr.MutableData() = 10; + ASSERT_EQ(ptr2.Data(), 0); + ASSERT_EQ(ptr.Data(), 10); +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 4dca3ceb4569fb708c7a98621c5239acbe217586..9796f2778955ce47fe9fd1b13baa36404b1574c5 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -41,6 +41,8 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") +cc_library(pass_builder SRCS pass_builder.cc DEPS pass) + cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 62f94a1c0e5a300438bbe5fea34b9a07df5d9ebf..c54766d95a61ac1a4b61566c6de62cbc86685a1d 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/ir/graph_helper.h" #include +#include #include -#include "paddle/fluid/framework/ir/graph_helper.h" - namespace paddle { namespace framework { namespace ir { @@ -113,6 +113,74 @@ std::map> BuildOperationAdjList( return adj_list; } +size_t GraphNum(const Graph &graph) { + std::unordered_set nodes = graph.Nodes(); + std::unordered_set visited_nodes; + visited_nodes.reserve(nodes.size()); + std::deque q_nodes; + std::vector> graph_nodes; + std::unordered_set g_nodes; + size_t graph_count = 0; + + auto traverse_nodes = [&visited_nodes, + &q_nodes](const std::vector &nodes) { + std::copy_if( + nodes.begin(), nodes.end(), std::back_inserter(q_nodes), + [&visited_nodes](Node *node) { return !visited_nodes.count(node); }); + }; + + while (visited_nodes.size() != nodes.size()) { + if (!q_nodes.empty()) { + auto cur_node = q_nodes.front(); + q_nodes.pop_front(); + visited_nodes.insert(cur_node); + g_nodes.insert(cur_node); + traverse_nodes(cur_node->inputs); + traverse_nodes(cur_node->outputs); + } else { + ++graph_count; + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + g_nodes.clear(); + for (auto &n : nodes) { + if (visited_nodes.count(n) == 0) { + q_nodes.push_back(n); + break; + } + } + } + } + + if (g_nodes.size()) { + graph_nodes.emplace_back(g_nodes); + } + + if (VLOG_IS_ON(10)) { + VLOG(10) << "graph_num: " << graph_nodes.size(); + for (auto &g_n : graph_nodes) { + VLOG(10) << "graph_nodes: " << g_n.size(); + if (g_n.size() < 10) { + std::stringstream out; + for (auto &node : g_n) { + out << "\nNode: " << node->Name() << " in ["; + for (auto &n : node->inputs) { + out << n->Name() << ", "; + } + out << "], out["; + for (auto &n : node->outputs) { + out << n->Name() << ", "; + } + out << "]"; + } + VLOG(10) << out.str(); + } + } + } + + return graph_count; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index cd6c53a07f8f56781989739d995226bd02b3d3d0..ec46b38c01b8c369ab37b4fbd5497ec120d8db91 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -27,6 +27,8 @@ namespace ir { // Test if the graph contains circle. bool HasCircle(const Graph &graph); +size_t GraphNum(const Graph &graph); + // Topology Sort the operations in the graph from inputs to outputs. // `graph` cannot contain circle. std::vector TopologySortOperations(const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_helper_test.cc b/paddle/fluid/framework/ir/graph_helper_test.cc index a260dd3da2a7863c06e51aa4feafd824ea254139..cea902809339f9d45b0e2525163f08a3c1c44c95 100644 --- a/paddle/fluid/framework/ir/graph_helper_test.cc +++ b/paddle/fluid/framework/ir/graph_helper_test.cc @@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) { ASSERT_EQ(node_map.at("op2"), 1UL); ASSERT_TRUE(node_map.at("op3") < node_map.at("op5")); } + +void BuildZeroGraph(Graph* g) {} + +void BuildOneGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + v4->outputs.push_back(o5); +} + +void BuildTwoGraphs(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation); + ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation); + ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable); + ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable); + + // o1->v1->o2 + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + // o2->v2->o3 + // o2->v2->o4 + o2->outputs.push_back(v2); + o3->inputs.push_back(v2); + o4->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o3); + v2->outputs.push_back(o4); + // o2->v3->o5 + // o2->outputs.push_back(v3); + o5->inputs.push_back(v3); + // v3->inputs.push_back(o2); + v3->outputs.push_back(o5); + // o3-v4->o5 + o3->outputs.push_back(v4); + // o5->inputs.push_back(v4); + v4->inputs.push_back(o3); + // v4->outputs.push_back(o5); +} + +TEST(GraphHelperTest, GraphNum) { + ProgramDesc prog; + + Graph g(prog); + BuildZeroGraph(&g); + ASSERT_EQ(GraphNum(g), 0); + + Graph g2(prog); + BuildOneGraph(&g2); + ASSERT_EQ(GraphNum(g2), 1); + + Graph g3(prog); + BuildTwoGraphs(&g3); + ASSERT_EQ(GraphNum(g3), 2); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index d7158eba62686be57499df697466797e4034ea8f..6cf405efe63d2bc284c4650771a747b27bb4a9f6 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -19,7 +19,6 @@ namespace paddle { namespace framework { namespace ir { std::unique_ptr Pass::Apply(std::unique_ptr graph) const { - PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 0f14083d259172f5b5f1ed80c7d38312d711beb5..9570c59cff2a6afeb1c607f7219b7b455974d6ce 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -42,6 +42,8 @@ class Pass { attr_dels_.clear(); } + std::string Type() const { return type_; } + std::unique_ptr Apply(std::unique_ptr graph) const; // Get a reference to the attributed previously set. @@ -52,6 +54,21 @@ class Pass { return *boost::any_cast(attrs_.at(attr_name)); } + bool Has(const std::string &attr_name) const { + return attrs_.find(attr_name) != attrs_.end(); + } + + void Erase(const std::string &attr_name) { + if (!Has(attr_name)) { + return; + } + if (attr_dels_.find(attr_name) != attr_dels_.end()) { + attr_dels_[attr_name](); + attr_dels_.erase(attr_name); + } + attrs_.erase(attr_name); + } + // Set a pointer to the attribute. Pass takes ownership of the attribute. template void Set(const std::string &attr_name, AttrType *attr) { @@ -68,13 +85,15 @@ class Pass { // should delete the attribute. template void SetNotOwned(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); attrs_[attr_name] = attr; } protected: - virtual std::unique_ptr ApplyImpl( - std::unique_ptr graph) const = 0; + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + LOG(FATAL) << "Calling virtual Pass not implemented."; + } private: template @@ -89,7 +108,10 @@ class Pass { required_graph_attrs_.insert(attrs.begin(), attrs.end()); } + void RegisterType(const std::string &type) { type_ = type; } + mutable bool applied_{false}; + std::string type_; std::unordered_set required_pass_attrs_; std::unordered_set required_graph_attrs_; std::map attrs_; @@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar { PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), "'%s' is registered more than once.", pass_type); PassRegistry::Instance().Insert( - pass_type, [this]() -> std::unique_ptr { + pass_type, [this, pass_type]() -> std::unique_ptr { std::unique_ptr pass(new PassType()); pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + pass->RegisterType(pass_type); return pass; }); } diff --git a/paddle/fluid/framework/ir/pass_builder.cc b/paddle/fluid/framework/ir/pass_builder.cc new file mode 100644 index 0000000000000000000000000000000000000000..e0719867b34d13666672b22070ce14dbaf80d85d --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.cc @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass_builder.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::shared_ptr PassBuilder::AppendPass(const std::string& pass_type) { + auto pass = ir::PassRegistry::Instance().Get(pass_type); + passes_.emplace_back(pass.release()); + return passes_.back(); +} + +void PassBuilder::RemovePass(size_t idx) { + PADDLE_ENFORCE(passes_.size() > idx); + passes_.erase(passes_.begin() + idx); +} + +std::shared_ptr PassBuilder::InsertPass(size_t idx, + const std::string& pass_type) { + PADDLE_ENFORCE(passes_.size() >= idx); + std::shared_ptr pass( + ir::PassRegistry::Instance().Get(pass_type).release()); + passes_.insert(passes_.begin() + idx, std::move(pass)); + return passes_[idx]; +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_builder.h b/paddle/fluid/framework/ir/pass_builder.h new file mode 100644 index 0000000000000000000000000000000000000000..733d3a3ad1ab8989ea30fe45cd7e1ffe9432de13 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_builder.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PassBuilder { + public: + PassBuilder() {} + + virtual ~PassBuilder() {} + + // Append a new pass to the end. + std::shared_ptr AppendPass(const std::string& pass_type); + + // Insert a new pass after `idx`. + std::shared_ptr InsertPass(size_t idx, const std::string& pass_type); + + // Remove a new pass at `idx`. + void RemovePass(size_t idx); + + // Returns a list of all passes. + std::vector> AllPasses() const { return passes_; } + + protected: + std::vector> passes_; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 5b5011412ed39e033a7a65921e9c64ce2d54c638..6ad7d1df8bdd016b617c820c022ef55f23ba21cd 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) { ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); - try { - graph = pass->Apply(std::move(graph)); - } catch (paddle::platform::EnforceNotMet e) { - exception = std::string(e.what()); - } - ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos); + // Allow apply more than once. + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph = pass->Apply(std::move(graph)); pass = PassRegistry::Instance().Get("test_pass"); pass->SetNotOwned("test_pass_attr", &val); diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 7836ecb1272a07a79a70c9cb040335f9a42e5684..77386f4f069489b6ff7b927a281bdc286ff816e0 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -17,10 +17,13 @@ #include #include #include +#include // NOLINT +#include #include - +#include "paddle/fluid/framework/details/cow_ptr.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/memory/memcpy.h" #include "glog/logging.h" @@ -28,206 +31,436 @@ namespace paddle { namespace framework { #if defined(PADDLE_WITH_CUDA) +namespace details { +struct CUDABuffer { + void *data_{nullptr}; + size_t size_{0}; + platform::CUDAPlace place_; + + CUDABuffer() {} + CUDABuffer(platform::Place place, size_t size) + : size_(size), place_(boost::get(place)) { + data_ = memory::Alloc(place_, size); + } + + ~CUDABuffer() { ClearMemory(); } + + CUDABuffer(const CUDABuffer &o) = delete; + CUDABuffer &operator=(const CUDABuffer &o) = delete; + + void Resize(platform::Place place, size_t size) { + ClearMemory(); + place_ = boost::get(place); + data_ = memory::Alloc(place_, size); + PADDLE_ENFORCE_NOT_NULL(data_); + size_ = size; + } + + void Swap(CUDABuffer &o) { + std::swap(data_, o.data_); + std::swap(place_, o.place_); + std::swap(size_, o.size_); + } + + private: + void ClearMemory() const { + if (data_ != nullptr) { + memory::Free(place_, data_); + } + } +}; +} // namespace details + // Vector implements the std::vector interface, and can get Data or // MutableData from any place. The data will be synced implicitly inside. template class Vector { public: using value_type = T; + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; - // Default ctor. Create empty Vector - Vector() { InitEmpty(); } + private: + // The actual class to implement vector logic + class VectorData { + public: + VectorData() : flag_(kDataInCPU) {} + VectorData(size_t count, const T &value) + : cpu_(count, value), flag_(kDataInCPU) {} + VectorData(std::initializer_list init) : cpu_(init), flag_(kDataInCPU) {} + template + explicit VectorData(const std::vector &dat) + : cpu_(dat), flag_(kDataInCPU) {} + ~VectorData() {} + + VectorData(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + } - // Fill vector with value. The vector size is `count`. - explicit Vector(size_t count, const T &value = T()) { - InitEmpty(); - if (count != 0) { - resize(count); - T *ptr = begin(); - for (size_t i = 0; i < count; ++i) { - ptr[i] = value; + VectorData &operator=(const VectorData &o) { + o.ImmutableCPU(); + cpu_ = o.cpu_; + flag_ = kDataInCPU; + details::CUDABuffer null; + gpu_.Swap(null); + return *this; + } + + T &operator[](size_t i) { + MutableCPU(); + return cpu_[i]; + } + + const T &operator[](size_t i) const { + ImmutableCPU(); + return cpu_[i]; + } + + size_t size() const { return cpu_.size(); } + + iterator begin() { + MutableCPU(); + return cpu_.begin(); + } + + iterator end() { + MutableCPU(); + return cpu_.end(); + } + + T &front() { + MutableCPU(); + return cpu_.front(); + } + + T &back() { + MutableCPU(); + return cpu_.back(); + } + + const_iterator begin() const { + ImmutableCPU(); + return cpu_.begin(); + } + + const_iterator end() const { + ImmutableCPU(); + return cpu_.end(); + } + + const T &back() const { + ImmutableCPU(); + return cpu_.back(); + } + + T *data() { return &(*this)[0]; } + + const T *data() const { return &(*this)[0]; } + + const T &front() const { + ImmutableCPU(); + return cpu_.front(); + } + + // assign this from iterator. + // NOTE: the iterator must support `end-begin` + template + void assign(Iter begin, Iter end) { + MutableCPU(); + cpu_.assign(begin, end); + } + + // push_back. If the previous capacity is not enough, the memory will + // double. + void push_back(T elem) { + MutableCPU(); + cpu_.push_back(elem); + } + + // extend a vector by iterator. + // NOTE: the iterator must support end-begin + template + void Extend(It begin, It end) { + MutableCPU(); + auto out_it = std::back_inserter>(this->cpu_); + std::copy(begin, end, out_it); + } + + // resize the vector + void resize(size_t size) { + MutableCPU(); + cpu_.resize(size); + } + + // get cuda ptr. immutable + const T *CUDAData(platform::Place place) const { + PADDLE_ENFORCE(platform::is_gpu_place(place), + "CUDA Data must on CUDA place"); + ImmutableCUDA(place); + return reinterpret_cast(gpu_.data_); + } + + // get cuda ptr. mutable + T *CUDAMutableData(platform::Place place) { + const T *ptr = CUDAData(place); + flag_ = kDirty | kDataInCUDA; + return const_cast(ptr); + } + + // clear + void clear() { + cpu_.clear(); + flag_ = kDirty | kDataInCPU; + } + + size_t capacity() const { return cpu_.capacity(); } + + // reserve data + void reserve(size_t size) const { cpu_.reserve(size); } + + // implicit cast operator. Vector can be cast to std::vector implicitly. + operator std::vector() const { + ImmutableCPU(); + return cpu_; + } + + bool operator==(const VectorData &other) const { + ImmutableCPU(); + other.ImmutableCPU(); + return cpu_ == other.cpu_; + } + + std::mutex &Mutex() const { return mtx_; } + + std::unique_ptr CUDAPlace() const { + if (gpu_.data_ == nullptr) { + return nullptr; + } else { + return std::unique_ptr( + new platform::CUDAPlace(gpu_.place_)); } } - } - // Ctor with init_list - Vector(std::initializer_list init) { - if (init.size() == 0) { - InitEmpty(); - } else { - InitByIter(init.size(), init.begin(), init.end()); + private: + enum DataFlag { + kDataInCPU = 0x01, + kDataInCUDA = 0x02, + // kDirty means the data has been changed in one device. + kDirty = 0x10 + }; + + void CopyToCPU() const { + // COPY GPU Data To CPU + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::Place(gpu_.place_))); + auto stream = dev_ctx->stream(); + void *src = gpu_.data_; + void *dst = cpu_.data(); + memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_, + stream); + dev_ctx->Wait(); + } + + void MutableCPU() { + if (IsInCUDA() && IsDirty()) { + CopyToCPU(); + } + flag_ = kDirty | kDataInCPU; } - } + + void ImmutableCUDA(platform::Place place) const { + if (IsDirty()) { + if (IsInCPU()) { + CopyCPUDataToCUDA(place); + UnsetFlag(kDirty); + SetFlag(kDataInCUDA); + } else if (IsInCUDA() && + !(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen"); + // Still dirty + } else { + // Dirty && DataInCUDA && Device is same + // Do nothing + } + } else { + if (!IsInCUDA()) { + // Even data is not dirty. However, data is not in CUDA. Copy data. + CopyCPUDataToCUDA(place); + SetFlag(kDataInCUDA); + } else if (!(boost::get(place) == gpu_.place_)) { + PADDLE_THROW("This situation should not happen."); + } else { + // Not Dirty && DataInCUDA && Device is same + // Do nothing. + } + } + } + + void CopyCPUDataToCUDA(const platform::Place &place) const { + void *src = cpu_.data(); + gpu_.Resize(place, cpu_.size() * sizeof(T)); + void *dst = gpu_.data_; + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto stream = dev_ctx->stream(); + memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_, + stream); + } + + void ImmutableCPU() const { + if (IsDirty() && !IsInCPU()) { // If data has been changed in CUDA, or + // CPU has no data. + CopyToCPU(); + UnsetFlag(kDirty); + } + SetFlag(kDataInCPU); + } + + void UnsetFlag(int flag) const { flag_ &= ~flag; } + void SetFlag(int flag) const { flag_ |= flag; } + + bool IsDirty() const { return flag_ & kDirty; } + + bool IsInCUDA() const { return flag_ & kDataInCUDA; } + + bool IsInCPU() const { return flag_ & kDataInCPU; } + + mutable std::vector cpu_; + mutable details::CUDABuffer gpu_; + mutable int flag_; + + mutable std::mutex mtx_; + }; + + public: + // Default ctor. Create empty Vector + Vector() : m_(new VectorData()) {} + + // Fill vector with value. The vector size is `count`. + explicit Vector(size_t count, const T &value = T()) + : m_(new VectorData(count, value)) {} + + // Ctor with init_list + Vector(std::initializer_list init) : m_(new VectorData(init)) {} // implicit cast from std::vector. template - Vector(const std::vector &dat) { // NOLINT - if (dat.size() == 0) { - InitEmpty(); - } else { - InitByIter(dat.size(), dat.begin(), dat.end()); - } + Vector(const std::vector &dat) : m_(new VectorData(dat)) { // NOLINT } // Copy ctor - Vector(const Vector &other) { this->operator=(other); } + Vector(const Vector &other) { m_ = other.m_; } // Copy operator Vector &operator=(const Vector &other) { - if (other.size() != 0) { - this->InitByIter(other.size(), other.begin(), other.end()); - } else { - InitEmpty(); - } + m_ = other.m_; return *this; } // Move ctor - Vector(Vector &&other) { - this->size_ = other.size_; - this->flag_ = other.flag_; - if (other.cuda_vec_.memory_size()) { - this->cuda_vec_.ShareDataWith(other.cuda_vec_); - } - if (other.cpu_vec_.memory_size()) { - this->cpu_vec_.ShareDataWith(other.cpu_vec_); - } - } + Vector(Vector &&other) { m_ = std::move(other.m_); } // CPU data access method. Mutable. - T &operator[](size_t i) { - MutableCPU(); - return const_cast(cpu_vec_.data())[i]; - } + T &operator[](size_t i) { return (*m_.MutableData())[i]; } // CPU data access method. Immutable. - const T &operator[](size_t i) const { - ImmutableCPU(); - return cpu_vec_.data()[i]; - } + const T &operator[](size_t i) const { return m_.Data()[i]; } // std::vector iterator methods. Based on CPU data access method - size_t size() const { return size_; } + size_t size() const { return m_.Data().size(); } - T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); } + iterator begin() { return m_.MutableData()->begin(); } - T *end() { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + iterator end() { return m_.MutableData()->end(); } - T &front() { return *begin(); } + T &front() { return m_.MutableData()->front(); } - T &back() { - auto it = end(); - --it; - return *it; - } + T &back() { return m_.MutableData()->back(); } - const T *begin() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); - } + const_iterator begin() const { return m_.Data().begin(); } - const T *end() const { - return capacity() == 0 ? &EmptyDummy() : &this->operator[](size()); - } + const_iterator end() const { return m_.Data().end(); } - const T *cbegin() const { return begin(); } + const_iterator cbegin() const { return begin(); } - const T *cend() const { return end(); } + const_iterator cend() const { return end(); } - const T &back() const { - auto it = end(); - --it; - return *it; - } + const T &back() const { return m_.Data().back(); } - T *data() { return begin(); } + T *data() { return m_.MutableData()->data(); } - const T *data() const { return begin(); } + const T *data() const { return m_.Data().data(); } - const T &front() const { return *begin(); } + const T &front() const { return m_.Data().front(); } // end of std::vector iterator methods // assign this from iterator. // NOTE: the iterator must support `end-begin` template void assign(Iter begin, Iter end) { - InitByIter(end - begin, begin, end); + m_.MutableData()->assign(begin, end); } // push_back. If the previous capacity is not enough, the memory will // double. - void push_back(T elem) { - if (size_ + 1 > capacity()) { - reserve((size_ + 1) << 1); - } - *end() = elem; - ++size_; - } + void push_back(T elem) { m_.MutableData()->push_back(elem); } // extend a vector by iterator. // NOTE: the iterator must support end-begin template void Extend(It begin, It end) { - size_t pre_size = size_; - resize(pre_size + (end - begin)); - T *ptr = this->begin() + pre_size; - for (; begin < end; ++begin, ++ptr) { - *ptr = *begin; - } + m_.MutableData()->Extend(begin, end); } // resize the vector void resize(size_t size) { - if (size + 1 <= capacity()) { - size_ = size; - } else { - MutableCPU(); - Tensor cpu_tensor; - platform::Place cpu = platform::CPUPlace(); - T *ptr = cpu_tensor.mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - const T *old_ptr = - cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data(); - if (old_ptr != nullptr) { - std::copy(old_ptr, old_ptr + size_, ptr); - } - size_ = size; - cpu_vec_.ShareDataWith(cpu_tensor); + if (m_.Data().size() != size) { + m_.MutableData()->resize(size); } } // get cuda ptr. immutable const T *CUDAData(platform::Place place) const { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "CUDA Data must on CUDA place"); - ImmutableCUDA(place); - return cuda_vec_.data(); + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == nullptr || + *cuda_place == boost::get(place)) { + return m_.Data().CUDAData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAData(place); } // get cuda ptr. mutable T *CUDAMutableData(platform::Place place) { - const T *ptr = CUDAData(place); - flag_ = kDirty | kDataInCUDA; - return const_cast(ptr); + { + auto &mtx = m_.Data().Mutex(); + std::lock_guard guard(mtx); + auto cuda_place = m_.Data().CUDAPlace(); + if (cuda_place == nullptr || + *cuda_place == boost::get(place)) { + return m_.MutableData()->CUDAMutableData(place); + } + } + // If m_ contains CUDAData in a different place. Detach manually. + m_.Detach(); + return CUDAMutableData(place); } // clear - void clear() { - size_ = 0; - flag_ = kDirty | kDataInCPU; - } + void clear() { m_.MutableData()->clear(); } - size_t capacity() const { - return cpu_vec_.memory_size() / SizeOfType(typeid(T)); - } + size_t capacity() const { return m_.Data().capacity(); } // reserve data - void reserve(size_t size) { - size_t pre_size = size_; - resize(size); - resize(pre_size); - } + void reserve(size_t size) { m_.Data().reserve(size); } // the unify method to access CPU or CUDA data. immutable. const T *Data(platform::Place place) const { @@ -248,12 +481,7 @@ class Vector { } // implicit cast operator. Vector can be cast to std::vector implicitly. - operator std::vector() const { - std::vector result; - result.resize(size()); - std::copy(begin(), end(), result.begin()); - return result; - } + operator std::vector() const { return m_.Data(); } bool operator==(const Vector &other) const { if (size() != other.size()) return false; @@ -267,118 +495,11 @@ class Vector { return true; } - private: - void InitEmpty() { - size_ = 0; - flag_ = kDataInCPU; - } - - template - void InitByIter(size_t size, Iter begin, Iter end) { - platform::Place cpu = platform::CPUPlace(); - T *ptr = this->cpu_vec_.template mutable_data( - framework::make_ddim({static_cast(size)}), cpu); - for (size_t i = 0; i < size; ++i) { - *ptr++ = *begin++; - } - flag_ = kDataInCPU | kDirty; - size_ = size; - } - - enum DataFlag { - kDataInCPU = 0x01, - kDataInCUDA = 0x02, - // kDirty means the data has been changed in one device. - kDirty = 0x10 - }; - - void CopyToCPU() const { - // COPY GPU Data To CPU - TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_); - WaitPlace(cuda_vec_.place()); - } - - void MutableCPU() { - if (IsInCUDA() && IsDirty()) { - CopyToCPU(); - } - flag_ = kDirty | kDataInCPU; - } - - void ImmutableCUDA(platform::Place place) const { - if (IsDirty()) { - if (IsInCPU()) { - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - UnsetFlag(kDirty); - SetFlag(kDataInCUDA); - } else if (IsInCUDA() && !(place == cuda_vec_.place())) { - framework::Tensor tmp; - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - cuda_vec_.ShareDataWith(tmp); - // Still dirty - } else { - // Dirty && DataInCUDA && Device is same - // Do nothing - } - } else { - if (!IsInCUDA()) { - // Even data is not dirty. However, data is not in CUDA. Copy data. - TensorCopy(cpu_vec_, boost::get(place), - &cuda_vec_); - WaitPlace(place); - SetFlag(kDataInCUDA); - } else if (!(place == cuda_vec_.place())) { - framework::Tensor tmp; - WaitPlace(cuda_vec_.place()); - TensorCopy(cuda_vec_, boost::get(place), &tmp); - WaitPlace(cuda_vec_.place()); - WaitPlace(place); - cuda_vec_.ShareDataWith(tmp); - } else { - // Not Dirty && DataInCUDA && Device is same - // Do nothing. - } - } - } - - void ImmutableCPU() const { - if (IsDirty() && - !IsInCPU()) { // If data has been changed in CUDA, or CPU has no data. - CopyToCPU(); - UnsetFlag(kDirty); - } - SetFlag(kDataInCPU); - } - - void UnsetFlag(int flag) const { flag_ &= ~flag; } - void SetFlag(int flag) const { flag_ |= flag; } + const void *Handle() const { return &m_.Data(); } - bool IsDirty() const { return flag_ & kDirty; } - - bool IsInCUDA() const { return flag_ & kDataInCUDA; } - - bool IsInCPU() const { return flag_ & kDataInCPU; } - - static void WaitPlace(const platform::Place place) { - if (platform::is_gpu_place(place)) { - platform::DeviceContextPool::Instance() - .Get(boost::get(place)) - ->Wait(); - } - } - - static T &EmptyDummy() { - static T dummy = T(); - return dummy; - } - - mutable int flag_; - mutable Tensor cpu_vec_; - mutable Tensor cuda_vec_; - size_t size_; + private: + // Vector is an COW object. + mutable details::COWPtr m_; }; #else // PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f5a54c0f48c98512dcb393d63a61f3c927e7ac1f..720d17a654bf96ca2bad43cc0c4374b2303ac233 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,21 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" - #include #include #include +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" -#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -35,80 +33,6 @@ limitations under the License. */ namespace paddle { namespace framework { -std::unique_ptr ApplyParallelExecutorPass( - const ProgramDesc &main_program, const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶m_names, - const std::vector &local_scopes, const bool use_cuda, -#ifdef PADDLE_WITH_CUDA - const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { -#else - const BuildStrategy &strategy) { -#endif - // Convert the program to graph. - std::unique_ptr graph(new ir::Graph(main_program)); - - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - - // Apply op fusion. - if (strategy.fuse_elewise_add_act_ops_) { - auto fuse_elewise_add_act_pass = - ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass"); - graph = fuse_elewise_add_act_pass->Apply(std::move(graph)); - // Apply a graph viz pass to record a graph. - if (!strategy.debug_graphviz_path_.empty()) { - auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); - const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); - viz_pass->Set("graph_viz_path", new std::string(graph_path)); - graph = viz_pass->Apply(std::move(graph)); - } - } - - // Convert graph to run on multi-devices. - auto multi_devices_pass = - ir::PassRegistry::Instance().Get("multi_devices_pass"); - multi_devices_pass->SetNotOwned>("places", - &places); - multi_devices_pass->SetNotOwned("loss_var_name", - &loss_var_name); - multi_devices_pass->SetNotOwned>( - "params", ¶m_names); - multi_devices_pass->SetNotOwned>("local_scopes", - &local_scopes); - multi_devices_pass->SetNotOwned("strategy", &strategy); - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - multi_devices_pass->SetNotOwned("nccl_ctxs", nctx); -#endif - graph = multi_devices_pass->Apply(std::move(graph)); - - // Apply a graph print pass to record a graph with device info. - if (!strategy.debug_graphviz_path_.empty()) { - auto multi_devices_print_pass = - ir::PassRegistry::Instance().Get("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy.debug_graphviz_path_); - multi_devices_print_pass->Set( - "graph_printer", new details::GraphvizSSAGraphPrinter); - graph = multi_devices_print_pass->Apply(std::move(graph)); - } - - // Verify that the graph is correct for multi-device executor. - auto multi_devices_check_pass = - ir::PassRegistry::Instance().Get("multi_devices_check_pass"); - graph = multi_devices_check_pass->Apply(std::move(graph)); - return graph; -} - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor( // Step 3. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #ifdef PADDLE_WITH_CUDA - std::unique_ptr graph = ApplyParallelExecutorPass( + std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy, - member_->nccl_ctxs_.get()); + member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { @@ -228,11 +151,17 @@ ParallelExecutor::ParallelExecutor( } } #else - std::unique_ptr graph = ApplyParallelExecutorPass( - main_program, member_->places_, loss_var_name, params, - member_->local_scopes_, member_->use_cuda_, build_strategy); + std::unique_ptr graph = + build_strategy.Apply(main_program, member_->places_, loss_var_name, + params, member_->local_scopes_, member_->use_cuda_); #endif + // If the loss_var_name is given, the number of graph should be only one. + if (loss_var_name.size()) { + PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1, + "The number of graph should be only one"); + } + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); @@ -373,12 +302,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle - -USE_PASS(fuse_elewise_add_act_pass); -USE_PASS(graph_viz_pass); -USE_PASS(multi_devices_pass); -USE_PASS(multi_devices_check_pass); -USE_PASS(multi_devices_print_pass); #ifdef PADDLE_WITH_CUDA USE_PASS(reference_count_pass); #endif diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c64906ff230df5f2b7cc9f5c6b29d68956ab8f33..fd386a5987f11ff64964e95eb7e9b83572dc790c 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,14 +14,14 @@ limitations under the License. */ #pragma once -#include #include #include #include #include #include + +#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" -#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index dd1ab85fd8d0c8170afcd9dd2a49ee55c41dc8be..dd5d138a1e979826d59c4731920379b030e3b492 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -76,8 +76,8 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto ap_type = GetAPType(ctx.Attr("ap_type")); int class_num = ctx.Attr("class_num"); - auto label_lod = in_label->lod(); - auto detect_lod = in_detect->lod(); + auto& label_lod = in_label->lod(); + auto& detect_lod = in_detect->lod(); PADDLE_ENFORCE_EQ(label_lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ(label_lod[0].size(), detect_lod[0].size(), @@ -166,11 +166,11 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto labels = framework::EigenTensor::From(input_label); auto detect = framework::EigenTensor::From(input_detect); - auto label_lod = input_label.lod(); - auto detect_lod = input_detect.lod(); + auto& label_lod = input_label.lod(); + auto& detect_lod = input_detect.lod(); int batch_size = label_lod[0].size() - 1; - auto label_index = label_lod[0]; + auto& label_index = label_lod[0]; for (int n = 0; n < batch_size; ++n) { std::map> boxes; @@ -274,7 +274,6 @@ class DetectionMAPOpKernel : public framework::OpKernel { output_true_pos->set_lod(true_pos_lod); output_false_pos->set_lod(false_pos_lod); - return; } void GetInputPos(const framework::Tensor& input_pos_count, @@ -292,7 +291,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { auto SetData = [](const framework::LoDTensor& pos_tensor, std::map>>& pos) { const T* pos_data = pos_tensor.data(); - auto pos_data_lod = pos_tensor.lod()[0]; + auto& pos_data_lod = pos_tensor.lod()[0]; for (size_t i = 0; i < pos_data_lod.size() - 1; ++i) { for (size_t j = pos_data_lod[i]; j < pos_data_lod[i + 1]; ++j) { T score = pos_data[j * 2]; @@ -317,20 +316,23 @@ class DetectionMAPOpKernel : public framework::OpKernel { std::map>>* false_pos) const { int batch_size = gt_boxes.size(); for (int n = 0; n < batch_size; ++n) { - auto image_gt_boxes = gt_boxes[n]; - for (auto it = image_gt_boxes.begin(); it != image_gt_boxes.end(); ++it) { + auto& image_gt_boxes = gt_boxes[n]; + for (auto& image_gt_box : image_gt_boxes) { size_t count = 0; - auto labeled_bboxes = it->second; + auto& labeled_bboxes = image_gt_box.second; if (evaluate_difficult) { count = labeled_bboxes.size(); } else { - for (size_t i = 0; i < labeled_bboxes.size(); ++i) - if (!(labeled_bboxes[i].is_difficult)) ++count; + for (auto& box : labeled_bboxes) { + if (!box.is_difficult) { + ++count; + } + } } if (count == 0) { continue; } - int label = it->first; + int label = image_gt_box.first; if (label_pos_count->find(label) == label_pos_count->end()) { (*label_pos_count)[label] = count; } else { diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc index 9a297d03cfb041e584159a5fc5ba214f8ac404b4..3acae3bcdf4a509ab6e7e19f21c4b2ec4d72b7d7 100644 --- a/paddle/fluid/operators/extract_rows_op.cc +++ b/paddle/fluid/operators/extract_rows_op.cc @@ -50,7 +50,7 @@ class ExtractRowsOp : public framework::OperatorBase { auto &in = scope.FindVar(Input("X"))->Get(); auto out = scope.FindVar(Output("Out"))->GetMutable(); - auto in_rows = in.rows(); + auto &in_rows = in.rows(); auto out_dim = framework::make_ddim( std::vector{static_cast(in_rows.size()), 1}); auto dst_ptr = out->mutable_data(out_dim, in.place()); diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 74823dab09cac358f647c074ac2f2ee2fed17e55..abd5dce8f7e7146a1671a387328c177e5e6e0a85 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -127,10 +127,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto gpu_place = boost::get(context.GetPlace()); // TODO(yuyang18): Strange code here. - memory::Copy(platform::CPUPlace(), - new_rows.CUDAMutableData(context.GetPlace()), gpu_place, - ids_data, ids_num * sizeof(int64_t), stream); - + memory::Copy(gpu_place, new_rows.CUDAMutableData(context.GetPlace()), + gpu_place, ids_data, ids_num * sizeof(int64_t), stream); d_table->set_rows(new_rows); auto *d_table_value = d_table->mutable_value(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index b27880c232a51d32777569cf9ac67656ce02f232..ba8eccf82042b679f69a32f9d053f05ac8fb9a99 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -60,11 +60,9 @@ struct SelectedRowsAdd { auto out_place = context.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(out_place)); - memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), out_data, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); auto* in2_data = in2_value.data(); memory::Copy(boost::get(out_place), @@ -148,7 +146,7 @@ struct SelectedRowsAddTo { auto in1_height = input1.height(); PADDLE_ENFORCE_EQ(in1_height, input2->height()); - framework::Vector in1_rows(input1.rows()); + auto& in1_rows = input1.rows(); auto& in2_rows = *(input2->mutable_rows()); auto& in1_value = input1.value(); diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc index 724463c95c4a29fb5c00fe791b389d3908771640..a4f41a170426a4650fd3bf8f7fec4758ff34e1b9 100644 --- a/paddle/fluid/operators/sampling_id_op.cc +++ b/paddle/fluid/operators/sampling_id_op.cc @@ -53,15 +53,16 @@ class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker { SamplingId Operator. A layer for sampling id from multinomial distribution from the input. Sampling one id for one sample.)DOC"); - AddAttr("min", "Minimum value of random. [default 0.0].") + AddAttr("min", "Minimum value of random. (float, default 0.0).") .SetDefault(0.0f); - AddAttr("max", "Maximun value of random. [default 1.0].") + AddAttr("max", "Maximun value of random. (float, default 1.0).") .SetDefault(1.0f); - AddAttr("seed", - "Random seed used for the random number engine. " - "0 means use a seed generated by the system." - "Note that if seed is not 0, this operator will always " - "generate the same random numbers every time. [default 0].") + AddAttr( + "seed", + "Random seed used for the random number engine. " + "0 means use a seed generated by the system." + "Note that if seed is not 0, this operator will always " + "generate the same random numbers every time. (int, default 0).") .SetDefault(0); } }; diff --git a/paddle/fluid/operators/sgd_op.cu b/paddle/fluid/operators/sgd_op.cu index 4722be7a666d3e8f3c25c9499f88ddda835f60e3..243609075713305a90dc162991166ba24d54e835 100644 --- a/paddle/fluid/operators/sgd_op.cu +++ b/paddle/fluid/operators/sgd_op.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU +#include #include "paddle/fluid/operators/sgd_op.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -33,22 +33,21 @@ __global__ void SGDKernel(const T* g, const T* p, const T* learning_rate, } } -template +template __global__ void SparseSGDFunctorKernel(const T* selected_rows, const int64_t* rows, const T* learning_rate, T* tensor_out, - int64_t row_numel) { - const int ty = blockIdx.y; - int tid = threadIdx.x; - - selected_rows += ty * row_numel; - tensor_out += rows[ty] * row_numel; - - for (int index = tid; index < row_numel; index += block_size) { - // Since index in rows of SelectedRows can be duplicate, we have to use - // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( - tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]); + int64_t row_numel, int64_t limit) { + for (int64_t i = blockIdx.x; i < limit; i += gridDim.x) { + const T* selected_rows_ptr = selected_rows + i * row_numel; + T* tensor_out_ptr = tensor_out + rows[i] * row_numel; + for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { + // Since index in rows of SelectedRows can be duplicate, we have to use + // Atomic Operation to avoid concurrent write error. + paddle::platform::CudaAtomicAdd( + tensor_out_ptr + index, + -1.0 * learning_rate[0] * selected_rows_ptr[index]); + } } } } // namespace @@ -89,7 +88,7 @@ class SGDOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(in_height, out_dims[0]); auto& in_value = grad->value(); - framework::Vector in_rows(grad->rows()); + auto& in_rows = grad->rows(); int64_t in_row_numel = in_value.numel() / in_rows.size(); PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height); @@ -97,13 +96,15 @@ class SGDOpCUDAKernel : public framework::OpKernel { auto* in_data = in_value.data(); auto* out_data = param_out->data(); - const int block_size = 256; - dim3 threads(block_size, 1); - dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<>>( + const int kThreadsPerBlock = 256; + int thread_x = kThreadsPerBlock; + int max_threads = ctx.cuda_device_context().GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + SparseSGDFunctorKernel<<>>( in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data(), - out_data, in_row_numel); + out_data, in_row_numel, in_rows.size()); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 7c61e38f6222886a49a3de47867f26aeb6273a6b..34403c7a7aa717cca470be2931009e219e00e3ae 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -124,7 +124,6 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim)); out_value->mutable_data(context.GetPlace()); - // if all the input sparse vars are empty, no need to // merge these vars. if (first_dim == 0UL) { diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b5bd07d401f9ebfe441bc0f84f9bad317f0e8da9..e7f634c4a622b48e97040987836406cf73cb23b6 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8b62502e3f920a3bf7d80f9e7edc2f3647a0e5b1..ef2f1f2a20a2eddee8ac077ee4bbf4dfd777448d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" @@ -595,6 +596,29 @@ All parameter, weight, gradient are variables in Paddle. m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); + py::class_> pass(m, "Pass"); + pass.def(py::init()) + .def("set_str", [](ir::Pass &self, const std::string &name, + const std::string &attr) { + self.Set(name, new std::string(attr)); + }); + + py::class_> pb( + m, "PassBuilder"); + pb.def(py::init()) + .def("append_pass", + [](ir::PassBuilder &self, + const std::string &pass_type) -> std::shared_ptr { + return self.AppendPass(pass_type); + }) + .def("all_passes", [](ir::PassBuilder &self) { return self.AllPasses(); }) + .def("insert_pass", + [](ir::PassBuilder &self, size_t idx, const std::string &pass_type) { + return self.InsertPass(idx, pass_type); + }) + .def("remove_pass", + [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); + // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy"); @@ -677,7 +701,11 @@ All parameter, weight, gradient are variables in Paddle. }, [](BuildStrategy &self, bool b) { self.fuse_elewise_add_act_ops_ = b; - }); + }) + .def("_create_passes_from_strategy", + [](BuildStrategy &self) -> std::shared_ptr { + return self.CreatePassesFromStrategy(); + }); pe.def(py::init &, const std::unordered_set &, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index c397f070e947ba787c13397dfc07e4b1e4e37e73..7d2fb7c6ce9e6a89df2c777323fc6a547fc227f4 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -70,8 +70,8 @@ function cmake_gen() { PYTHON_FLAGS="" SYSTEM=`uname -s` if [ "$SYSTEM" == "Darwin" ]; then + echo "Using python abi: $1" if [[ "$1" == "cp27-cp27m" ]] || [[ "$1" == "" ]]; then - echo "using python abi: $1" if [ -d "/Library/Frameworks/Python.framework/Versions/2.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/2.7 @@ -82,7 +82,18 @@ function cmake_gen() { else exit 1 fi - # TODO: qiyang add python3 part here + elif [ "$1" == "cp35-cp35m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.5" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -731,6 +742,10 @@ function main() { build_mac run_mac_test ${PROC_RUN:-1} ;; + macbuild) + cmake_gen ${PYTHON_ABI:-""} + build_mac + ;; cicheck_py35) cmake_gen ${PYTHON_ABI:-""} build diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 574d0d727cba9fa9de0cffbe116f71b9e65a7092..9772c65738a2c5373f657164e3bc379404ba642e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -284,7 +284,7 @@ def detection_output(loc, target_box=loc, code_type='decode_center_size') compile_shape = scores.shape - run_shape = ops.shape(scores) + run_shape = nn.shape(scores) scores = nn.flatten(x=scores, axis=2) scores = nn.softmax(input=scores) scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape) @@ -697,7 +697,7 @@ def ssd_loss(location, raise ValueError("Only support mining_type == max_negative now.") num, num_prior, num_class = confidence.shape - conf_shape = ops.shape(confidence) + conf_shape = nn.shape(confidence) def __reshape_to_2d(var): return nn.flatten(x=var, axis=2) @@ -724,7 +724,7 @@ def ssd_loss(location, target_label.stop_gradient = True conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - actual_shape = ops.slice(conf_shape, axes=[0], starts=[0], ends=[2]) + actual_shape = nn.slice(conf_shape, axes=[0], starts=[0], ends=[2]) actual_shape.stop_gradient = True conf_loss = nn.reshape( x=conf_loss, shape=(num, num_prior), actual_shape=actual_shape) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index d56fa76300e7054ef71a7729483a579fa35f1dac..81c78cba219007a9348af961e4b0dc227edba747 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -507,7 +507,6 @@ def py_reader(capacity, 1. The basic usage of :code:`py_reader` is as follows: - >>> import paddle.v2 >>> import paddle.fluid as fluid >>> import paddle.dataset.mnist as mnist >>> @@ -515,7 +514,7 @@ def py_reader(capacity, >>> shapes=[(-1,3,224,224), (-1,1)], >>> dtypes=['float32', 'int64']) >>> reader.decorate_paddle_reader( - >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()) >>> >>> img, label = fluid.layers.read_file(reader) >>> loss = network(img, label) # some network definition @@ -534,7 +533,6 @@ def py_reader(capacity, 2. When training and testing are both performed, two different :code:`py_reader` should be created with different names, e.g.: - >>> import paddle.v2 >>> import paddle.fluid as fluid >>> import paddle.dataset.mnist as mnist >>> @@ -548,7 +546,7 @@ def py_reader(capacity, >>> dtypes=['float32', 'int64'], >>> name='train_reader') >>> train_reader.decorate_paddle_reader( - >>> paddle.v2.reader.shuffle(paddle.batch(mnist.train()) + >>> paddle.reader.shuffle(paddle.batch(mnist.train()) >>> >>> test_reader = fluid.layers.py_reader(capacity=32, >>> shapes=[(-1,3,224,224), (-1,1)], diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2cb61a9cd25c744710ab7ac9ea591902740f78da..a9696ac20060d1069a99a02a79a755a740e760f0 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -29,110 +29,29 @@ from .. import unique_name from functools import reduce __all__ = [ - 'fc', - 'embedding', - 'dynamic_lstm', - 'dynamic_lstmp', - 'dynamic_gru', - 'gru_unit', - 'linear_chain_crf', - 'crf_decoding', - 'cos_sim', - 'cross_entropy', - 'square_error_cost', - 'chunk_eval', - 'sequence_conv', - 'conv2d', - 'conv3d', - 'sequence_pool', - 'sequence_softmax', - 'softmax', - 'pool2d', - 'pool3d', - 'batch_norm', - 'beam_search_decode', - 'conv2d_transpose', - 'conv3d_transpose', - 'sequence_expand', - 'sequence_expand_as', - 'sequence_pad', - 'lstm_unit', - 'reduce_sum', - 'reduce_mean', - 'reduce_max', - 'reduce_min', - 'reduce_prod', - 'sequence_first_step', - 'sequence_last_step', - 'dropout', - 'split', - 'ctc_greedy_decoder', - 'edit_distance', - 'l2_normalize', - 'matmul', - 'topk', - 'warpctc', - 'sequence_reshape', - 'transpose', - 'im2sequence', - 'nce', - 'hsigmoid', - 'beam_search', - 'row_conv', - 'multiplex', - 'layer_norm', - 'softmax_with_cross_entropy', - 'smooth_l1', - 'one_hot', - 'autoincreased_step_counter', - 'reshape', - 'squeeze', - 'unsqueeze', - 'lod_reset', - 'lrn', - 'pad', - 'pad_constant_like', - 'label_smooth', - 'roi_pool', - 'dice_loss', - 'image_resize', - 'image_resize_short', - 'resize_bilinear', - 'gather', - 'scatter', - 'sequence_scatter', - 'random_crop', - 'mean_iou', - 'relu', - 'log', - 'crop', - 'rank_loss', - 'elu', - 'relu6', - 'pow', - 'stanh', - 'hard_sigmoid', - 'swish', - 'prelu', - 'brelu', - 'leaky_relu', - 'soft_relu', - 'flatten', - 'sequence_mask', - 'stack', - 'pad2d', - 'unstack', - 'sequence_enumerate', - 'expand', - 'sequence_concat', - 'scale', - 'elementwise_add', - 'elementwise_div', - 'elementwise_sub', - 'elementwise_mul', - 'elementwise_max', - 'elementwise_min', - 'elementwise_pow', + 'fc', 'embedding', 'dynamic_lstm', 'dynamic_lstmp', 'dynamic_gru', + 'gru_unit', 'linear_chain_crf', 'crf_decoding', 'cos_sim', 'cross_entropy', + 'square_error_cost', 'chunk_eval', 'sequence_conv', 'conv2d', 'conv3d', + 'sequence_pool', 'sequence_softmax', 'softmax', 'pool2d', 'pool3d', + 'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'conv3d_transpose', + 'sequence_expand', 'sequence_expand_as', 'sequence_pad', 'lstm_unit', + 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min', 'reduce_prod', + 'sequence_first_step', 'sequence_last_step', 'dropout', 'split', + 'ctc_greedy_decoder', 'edit_distance', 'l2_normalize', 'matmul', 'topk', + 'warpctc', 'sequence_reshape', 'transpose', 'im2sequence', 'nce', + 'hsigmoid', 'beam_search', 'row_conv', 'multiplex', 'layer_norm', + 'softmax_with_cross_entropy', 'smooth_l1', 'one_hot', + 'autoincreased_step_counter', 'reshape', 'squeeze', 'unsqueeze', + 'lod_reset', 'lrn', 'pad', 'pad_constant_like', 'label_smooth', 'roi_pool', + 'dice_loss', 'image_resize', 'image_resize_short', 'resize_bilinear', + 'gather', 'scatter', 'sequence_scatter', 'random_crop', 'mean_iou', 'relu', + 'log', 'crop', 'rank_loss', 'elu', 'relu6', 'pow', 'stanh', 'hard_sigmoid', + 'swish', 'prelu', 'brelu', 'leaky_relu', 'soft_relu', 'flatten', + 'sequence_mask', 'stack', 'pad2d', 'unstack', 'sequence_enumerate', + 'expand', 'sequence_concat', 'scale', 'elementwise_add', 'elementwise_div', + 'elementwise_sub', 'elementwise_mul', 'elementwise_max', 'elementwise_min', + 'elementwise_pow', 'uniform_random_batch_size_like', 'gaussian_random', + 'sampling_id', 'gaussian_random_batch_size_like', 'sum', 'slice', 'shape' ] @@ -6463,6 +6382,246 @@ def expand(x, expand_times, name=None): return out +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + + +@templatedoc() +def uniform_random_batch_size_like(input, + shape, + dtype='float32', + input_dim_idx=0, + output_dim_idx=0, + min=-1.0, + max=1.0, + seed=0): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + shape (tuple|list): ${shape_comment} + input_dim_idx (Int): ${input_dim_idx_comment} + output_dim_idx (Int): ${output_dim_idx_comment} + min (Float): ${min_comment} + max (Float): ${max_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of data : float32, float_16, int etc + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('uniform_random_batch_size_like', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='uniform_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'min': min, + 'max': max, + 'seed': seed, + 'dtype': c_dtype + }) + + return out + + +@templatedoc() +def gaussian_random(shape, + mean=0.0, + std=1.0, + seed=0, + dtype='float32', + use_mkldnn=False): + """ + ${comment} + + Args: + shape (tuple|list): ${shape_comment} + mean (Float): ${mean_comment} + std (Float): ${std_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): Output data type. + use_mkldnn (Bool): Only used in mkldnn kernel. + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('gaussian_random', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='gaussian_random', + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype, + 'use_mkldnn': use_mkldnn + }) + + return out + + +@templatedoc() +def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + min (Float): ${min_comment} + max (Float): ${max_comment} + seed (Float): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('sampling_id', **locals()) + out = helper.create_tmp_variable(dtype) + helper.append_op( + type='sampling_id', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'min': min, + 'max': max, + 'seed': seed}) + + return out + + +@templatedoc() +def gaussian_random_batch_size_like(input, + shape, + input_dim_idx=0, + output_dim_idx=0, + mean=0.0, + std=1.0, + seed=0, + dtype='float32'): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + shape (tuple|list): ${shape_comment} + input_dim_idx (Int): ${input_dim_idx_comment} + output_dim_idx (Int): ${output_dim_idx_comment} + mean (Float): ${mean_comment} + std (Float): ${std_comment} + seed (Int): ${seed_comment} + dtype(np.dtype|core.VarDesc.VarType|str): The type of output data : float32, float_16, int etc + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('gaussian_random_batch_size_like', **locals()) + out = helper.create_tmp_variable(dtype) + c_dtype = convert_np_dtype_to_dtype_(dtype) + helper.append_op( + type='gaussian_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype + }) + + return out + + +@templatedoc() +def sum(x, use_mkldnn=False): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + use_mkldnn (Bool): ${use_mkldnn_comment} + + Returns: + out (Variable): ${out_comment} + """ + + helper = LayerHelper('sum', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('x')) + helper.append_op( + type='sum', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'use_mkldnn': use_mkldnn}) + + return out + + +@templatedoc() +def slice(input, axes, starts, ends): + """ + ${comment} + + Args: + input (Variable): ${input_comment}. + axes (List): ${axes_comment} + starts (List): ${starts_comment} + ends (List): ${ends_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('slice', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='slice', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={'axes': axes, + 'starts': starts, + 'ends': ends}) + + return out + + +@templatedoc() +def shape(input): + """ + ${comment} + + Args: + input (Variable): ${input_comment} + + Returns: + out (Variable): ${out_comment} + + """ + + helper = LayerHelper('shape', **locals()) + out = helper.create_tmp_variable(dtype=helper.input_dtype('input')) + helper.append_op( + type='shape', inputs={'Input': input}, outputs={'Out': out}) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 363e9200bedfd9cc843513c2db16eadb5ffa33b4..220d065f8f1cc02508dea2679820e1f7f490866d 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -45,13 +45,6 @@ __all__ = [ 'logical_or', 'logical_xor', 'logical_not', - 'uniform_random_batch_size_like', - 'gaussian_random', - 'sampling_id', - 'gaussian_random_batch_size_like', - 'sum', - 'slice', - 'shape', 'maxout', ] diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b5549c507ed753f4504afd655be59b444164e6f3..e97643cddef22465436051a41ef4b825e9634d23 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -345,7 +345,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + - str(actual_t)) + str(actual_t) + " in class " + self.__class__.__name__) if isinstance(expect, tuple): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + diff --git a/python/paddle/fluid/tests/unittests/test_detection_map_op.py b/python/paddle/fluid/tests/unittests/test_detection_map_op.py index f6eb8f2c6d8b94f92e24ff789c91efb53a645a46..0c5343a97d5ef0f97fc6b144dfc82174eacb8573 100644 --- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py +++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py @@ -20,6 +20,7 @@ import six import sys import collections import math +import paddle.fluid as fluid from op_test import OpTest @@ -32,7 +33,7 @@ class TestDetectionMAPOp(OpTest): self.detect = np.array(self.detect).astype('float32') self.mAP = np.array(self.mAP).astype('float32') - if (len(self.class_pos_count) > 0): + if len(self.class_pos_count) > 0: self.class_pos_count = np.array(self.class_pos_count).astype( 'int32') self.true_pos = np.array(self.true_pos).astype('float32') @@ -273,7 +274,7 @@ class TestDetectionMAPOp11Point(TestDetectionMAPOp): class TestDetectionMAPOpMultiBatch(TestDetectionMAPOp): def init_test_case(self): super(TestDetectionMAPOpMultiBatch, self).init_test_case() - self.class_pos_count = [0, 2, 1] + self.class_pos_count = [0, 2, 1, 0] self.true_pos_lod = [[0, 3, 2]] self.true_pos = [[0.7, 1.], [0.3, 0.], [0.2, 1.], [0.8, 0.], [0.1, 1.]] self.false_pos_lod = [[0, 3, 2]] diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f474cdae2054531d44724e0e3e0e58a35fb8ddcd..b8dc9e8ad7cd7cd100d5c3cb99319e6f5a37da91 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -541,7 +541,7 @@ class TestBook(unittest.TestCase): with program_guard(program): input = layers.data( name="input", shape=[3, 100, 100], dtype="float32") - out = layers.shape(input, name="shape") + out = layers.shape(input) self.assertIsNotNone(out) print(str(program)) @@ -758,6 +758,65 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_uniform_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + out = layers.uniform_random_batch_size_like(input, [-1, 11]) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random(self): + program = Program() + with program_guard(program): + out = layers.gaussian_random(shape=[20, 30]) + self.assertIsNotNone(out) + print(str(program)) + + def test_sampling_id(self): + program = Program() + with program_guard(program): + x = layers.data( + name="X", + shape=[13, 11], + dtype='float32', + append_batch_size=False) + + out = layers.sampling_id(x) + self.assertIsNotNone(out) + print(str(program)) + + def test_gaussian_random_batch_size_like(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.gaussian_random_batch_size_like( + input, shape=[-1, 11], mean=1.0, std=2.0) + self.assertIsNotNone(out) + print(str(program)) + + def test_sum(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[13, 11], dtype='float32') + + out = layers.sum(input) + self.assertIsNotNone(out) + print(str(program)) + + def test_slice(self): + starts = [1, 0, 2] + ends = [3, 3, 4] + axes = [0, 1, 2] + + program = Program() + with program_guard(program): + input = layers.data( + name="input", shape=[3, 4, 5, 6], dtype='float32') + + out = layers.slice(input, axes=axes, starts=starts, ends=ends) + def test_softshrink(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..288c5f6a1f6b1760ca40c0c653e4c0726b799519 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py @@ -0,0 +1,121 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import unittest +import os +import sys +import math + + +def simple_fc_net(): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestPassBuilder(unittest.TestCase): + def check_network_convergence(self, use_cuda, build_strategy=None): + os.environ['CPU_NUM'] = str(4) + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = simple_fc_net() + test_program = main.clone(for_test=True) + + opt = fluid.optimizer.SGD(learning_rate=0.001) + opt.minimize(loss) + + batch_size = 32 + image = np.random.normal(size=(batch_size, 784)).astype('float32') + label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + feed_dict = {'image': image, 'label': label} + + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=main, + build_strategy=build_strategy) + + test_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=test_program, + share_vars_from=train_exe, + build_strategy=build_strategy) + + for i in range(5): + test_loss, = test_exe.run([loss.name], feed=feed_dict) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + + avg_test_loss_val = np.array(test_loss).mean() + if math.isnan(float(avg_test_loss_val)): + sys.exit("got NaN loss, testing failed.") + + avg_train_loss_val = np.array(train_loss).mean() + if math.isnan(float(avg_train_loss_val)): + sys.exit("got NaN loss, training failed.") + + self.assertTrue( + np.allclose( + train_loss, test_loss, atol=1e-8), + "Train loss: " + str(train_loss) + "\n Test loss:" + + str(test_loss)) + + def test_parallel_testing_with_new_strategy(self): + build_strategy = fluid.BuildStrategy() + pass_builder = build_strategy._create_passes_from_strategy() + origin_len = len(pass_builder.all_passes()) + + viz_pass = pass_builder.append_pass("graph_viz_pass") + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + + pass_builder.insert_pass( + len(pass_builder.all_passes()), "graph_viz_pass") + self.assertEqual(origin_len + 2, len(pass_builder.all_passes())) + + pass_builder.remove_pass(len(pass_builder.all_passes()) - 1) + self.assertEqual(origin_len + 1, len(pass_builder.all_passes())) + viz_pass.set_str("graph_viz_path", "/tmp/test_viz_pass") + + self.check_network_convergence( + use_cuda=core.is_compiled_with_cuda(), + build_strategy=build_strategy) + try: + os.stat("/tmp/test_viz_pass") + except os.error: + self.assertFalse(True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py index ab7a18d4c5c4ce1e490e2951ff9fbb023324e753..143d187edc3a154418f9e639b7d492c8ce994d42 100644 --- a/python/paddle/fluid/tests/unittests/transformer_model.py +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -246,6 +246,7 @@ def prepare_encoder(src_word, padding_idx=pos_pad_idx, param_attr=fluid.ParamAttr( name=pos_enc_param_name, trainable=False)) + src_pos_enc.stop_gradient = True enc_input = src_word_emb + src_pos_enc # FIXME(guosheng): Decouple the program desc with batch_size.