diff --git a/README.md b/README.md index eb99ed21d02650ef16cc7da91836909c02895be9..a67cb8ad439f462c361cb6bac2449c3a4b042126 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,21 @@ learning to many products at Baidu. Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. -### Lastest PaddlePaddle Version: [Fluid](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid) + +### Latest PaddlePaddle Release: [Fluid 0.14.0](https://github.com/PaddlePaddle/Paddle/tree/v0.14.0) +### Install Latest Stable Release: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==0.14.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==0.14.0.post85 + +# For installation on other platform, refer to http://paddlepaddle.org/ +``` ## Features diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md new file mode 100644 index 0000000000000000000000000000000000000000..a141dcbca584c6064c8da863410692a8be911d12 --- /dev/null +++ b/doc/fluid/design/ir/draft.md @@ -0,0 +1,89 @@ +## Motivation + +There is a ```gap``` between the ```Program``` defined by +user and the ```Executable``` that can be scheduled +efficiently on heterogeneous hardware, either locally +or distributedly. + +Usually, the ```gap``` is bridged by + +* A serious transformations with defined order. + +* These transformations usually involve +```insert, delete, clustering, split, dependency analysis```. + +* Has a simple way to verify and debug each transformation. + +* Flexible to add, remove or customize transformations to fit +the requirements of various algorithms (models) and hardware secenarios. + +Some other events also push us to a better unified pattern. + +* The deep learning framework is built around the concepts of graphs. +To leverage tools such as compilation (e.g. TVM and nGraph) or +cross-framework conversion (e.g. ONNX), we also need a intermediate +representation that can be connected to the rest of the ecosystem. + + +We need a unified pattern to naturally support the requirements +described above. The pattern should fit both training, inference +and other offline serielized model transformations. +Learned from LLVM and other deep learning framework, we draft the +design below. + + +## Design + +### Major Concepts + +#### Node + +```Node``` represents an operation that performs some computation or +a variable that is input or output of operation. + +```Node```s are connected to other ```Node```s via inputs and outputs. + +Other properties (maybe device placement information) can be added +to ```Node``` in the future if it's a +common requirement of many other ```Pass```es. Otherwise, it should live +in a ```Node``` wrapper class that is private to some ```Pass``` or be +a local member of a ```Pass```. + +#### Graph + +```Graph``` contains a list of ```Node```s, which are connected to +each other via inputs and outputs. + +TODO: Better definitions for the graph. + +```Graph``` can also contain ```Attribute```s. ```Attribute```s +can be ``any`` thing. For example, it can be a list of "wraper" +nodes. The ```wrapper``` nodes compose ```Node```s and provide +helper method for execution or transformation. ```Attribute``` +can also contain other things that describe some properties of +the ```Graph``` or ```Graph``` nodes. ```Attribute``` can be passed +across ```Pass```. However, it should be used with care. + +#### Pass + +```Pass``` represents a transformation of ```Graph```. Its input +is a ```Graph``` and its output is also a ```Graph```. For example, +a ```Pass``` can simply print out the ```Graph```. A ```Pass``` +can also fuse some ```Graph```'s ```Node```s. + +#### Optimize + +```Optimize``` contains a series of ```Pass``` with defined order. +```Optimize``` transforms a ```Graph``` that only contains raw +modeling logic to a ```Graph``` that can be run efficiently while +maintaining the original modeling logic. + + +### Optimize Process + +* Program is first converted to Graph. +* Graph goes through a series of Pass +* Graph is transformed from raw model logic to a +form that is efficient to execute. + +Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 80a268aae0025cf9b72f9cb523348de957302620..45df5de532c1085c9ad8f6b26dfebbf1d7a3ad94 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -341,6 +341,26 @@ paddle.fluid.layers.polynomial_decay ArgSpec(args=['learning_rate', 'decay_steps paddle.fluid.layers.piecewise_decay ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.noam_decay ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.append_LARS ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.InitState.__init__ ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')) +paddle.fluid.contrib.StateCell.__init__ ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.StateCell.compute_state ArgSpec(args=['self', 'inputs'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.get_input ArgSpec(args=['self', 'input_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.get_state ArgSpec(args=['self', 'state_name'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.out_state ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.set_state ArgSpec(args=['self', 'state_name', 'state_value'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.state_updater ArgSpec(args=['self', 'updater'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.StateCell.update_states ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.contrib.TrainingDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.TrainingDecoder.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.static_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.TrainingDecoder.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.__init__ ArgSpec(args=['self', 'state_cell', 'init_ids', 'init_scores', 'target_dict_dim', 'word_dim', 'input_var_dict', 'topk_size', 'sparse_emb', 'max_len', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=({}, 50, True, 100, 1, 1, None)) +paddle.fluid.contrib.BeamSearchDecoder.block ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.decode ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.early_stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.contrib.BeamSearchDecoder.read_array ArgSpec(args=['self', 'init', 'is_ids', 'is_scores'], varargs=None, keywords=None, defaults=(False, False)) +paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.DistributeTranspiler.create_splited_vars ArgSpec(args=['self', 'source_var', 'block', 'tag'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ec252929d5584c211cea7fa52004ecdfdf586a85..de06c860f550641a58a32d49e85feb7278fed1dd 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,4 +1,5 @@ add_subdirectory(details) +add_subdirectory(ir) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -93,7 +94,7 @@ else() endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4fb4ec38ee965a2790d11378a1ce6befa0ef5a00..e8057c35e8b957cb43e66937a5073a085c6e7708 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,8 +5,7 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(rpc_op_handle SRCS rpc_op_handle.cc DEPS framework_proto scope place operator op_registry) -cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) -cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) +cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS graph) cc_library(ssa_graph_printer SRCS ssa_graph_printer.cc DEPS ssa_graph_builder) cc_library(ssa_graph_checker SRCS ssa_graph_checker.cc DEPS ssa_graph_builder) @@ -35,7 +34,7 @@ cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) -cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph framework_proto) +cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b335d3a0d364c916e19574de8d3ed89aaec7de41..700c73c745bad72637d77385f5cd38c494501c86 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -23,10 +23,14 @@ namespace framework { namespace details { #ifdef PADDLE_WITH_CUDA -AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(ctxs) { if (nccl_ctxs_) { for (auto &p : places_) { this->dev_ctxes_[p] = nccl_ctxs_->DevCtx(p); @@ -34,9 +38,10 @@ AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, } } #else -AllReduceOpHandle::AllReduceOpHandle(const std::vector &local_scopes, +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif void AllReduceOpHandle::RunImpl() { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index fdd250b0d3eb166249271a95f7592b9fadee5265..f6ef3a1367b91b6abf8ce74a91f73056efd0f84e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -30,11 +30,11 @@ namespace details { struct AllReduceOpHandle : public OpHandleBase { #ifdef PADDLE_WITH_CUDA - AllReduceOpHandle(const std::vector &local_scopes, + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); #else - AllReduceOpHandle(const std::vector &local_scopes, + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 8036f756b6d6506684c109ab881d546f38176a10..fe4e733e43417977df324fde808f52b228a27d19 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -35,10 +35,13 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: #ifdef PADDLE_WITH_CUDA - BroadcastOpHandle(const std::vector &local_scopes, + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); @@ -46,9 +49,9 @@ struct BroadcastOpHandle : public OpHandleBase { } } #else - BroadcastOpHandle(const std::vector &local_scopes, + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index c6e923ef77ff03413eefe4f26457a5322747618e..1413f7bd9ac515ae7dceee62de8f3bc74e3a2efc 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -96,48 +96,61 @@ struct TestBroadcastOpHandle { } param_scopes_[input_scope_idx]->Var("input"); + std::unique_ptr n( + new ir::Node("node0", ir::Node::Type::kOperation)); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, + nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new BroadcastOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_, + nccl_ctxs_.get())); #else - op_handle_.reset(new BroadcastOpHandle(local_scopes_, gpu_list_)); + op_handle_.reset( + new BroadcastOpHandle(n.get(), local_scopes_, gpu_list_)); #endif } - auto* in_var_handle = - new VarHandle(1, input_scope_idx, "input", gpu_list_[input_scope_idx]); + std::unique_ptr v( + new ir::Node("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", + gpu_list_[input_scope_idx]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add dummy var - vars_.emplace_back(new DummyVarHandle()); + + std::unique_ptr v2( + new ir::Node("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(v2.get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); - dummy_var_handle->generated_op_ = nullptr; + dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < gpu_list_.size(); ++j) { if (!use_gpu_) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); } - VarHandle* out_var_handle = new VarHandle(2, j, "out", gpu_list_[j]); + std::unique_ptr v3( + new ir::Node("node3", ir::Node::Type::kVariable)); + VarHandle* out_var_handle = + new VarHandle(v3.get(), 2, j, "out", gpu_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } // add dummy var - vars_.emplace_back(new DummyVarHandle()); + std::unique_ptr v4( + new ir::Node("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(v4.get())); DummyVarHandle* out_dummy_var_handle = static_cast(vars_.back().get()); - out_dummy_var_handle->generated_op_ = nullptr; + out_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddOutput(out_dummy_var_handle); } diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index df05bb06333d6b964f2f5434c3d43214e5d2cb7a..b6282debdb4eb6b1f29c39e54ac4f3e2296838da 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -19,9 +19,10 @@ namespace paddle { namespace framework { namespace details { -ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope, +ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place) - : op_(framework::OpRegistry::CreateOp(op_desc)), + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), place_(place) {} @@ -35,8 +36,8 @@ void ComputationOpHandle::RunImpl() { bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) { bool need_wait = - in_var && in_var->generated_op_ && - in_var->generated_op_->DeviceContext(place_) != dev_ctxes_[place_]; + in_var && in_var->GeneratedOp() && + in_var->GeneratedOp()->DeviceContext(place_) != dev_ctxes_[place_]; return need_wait; } diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index f048f973fdeb6cf7d1485cda8cea7d530d9ba465..d9fcd92427ef38b131b4ce782c0ada37765682db 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,8 +28,7 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(const OpDesc &op_desc, Scope *scope, - platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); std::string Name() const override; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 68896c8ac1bae7d4bfcfa79cc8ec5c26bf2d93ee..525d24322442ef4dd6e8c24212af61c908959b87 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -22,10 +22,10 @@ namespace details { #ifdef PADDLE_WITH_CUDA DataBalanceOpHandle::DataBalanceOpHandle( - const std::vector &local_scopes, + ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs) - : local_scopes_(local_scopes), places_(places) { + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) { if (ctxs) { for (auto &p : places_) { this->dev_ctxes_[p] = ctxs->DevCtx(p); @@ -34,9 +34,9 @@ DataBalanceOpHandle::DataBalanceOpHandle( } #else DataBalanceOpHandle::DataBalanceOpHandle( - const std::vector &local_scopes, + ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string DataBalanceOpHandle::Name() const { return "data balance"; } diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 76a407e3610e8bb48facf1f814779f4c23f92d98..0462fb6ec713eb977f420a9cb485c0273e782496 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -30,11 +30,11 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { public: #ifdef PADDLE_WITH_CUDA - DataBalanceOpHandle(const std::vector &local_scopes, + DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); #else - DataBalanceOpHandle(const std::vector &local_scopes, + DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); #endif diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index d646c944601e81477787740189d7ac60ae97fa80..fe18b2060c5cd7e157374da53c5a985f70545ab7 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -21,13 +21,16 @@ namespace paddle { namespace framework { namespace details { -FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset, +FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, std::vector *local_scopes) - : data_(data), offset_(offset), local_scopes_(local_scopes) {} + : OpHandleBase(node), + data_(data), + offset_(offset), + local_scopes_(local_scopes) {} FetchOpHandle::~FetchOpHandle() { for (auto *input_var : inputs_) { - input_var->pending_ops_.erase(this); + input_var->RemoveOutput(this, this->Node()); } } @@ -77,8 +80,8 @@ void FetchOpHandle::RunImpl() { void FetchOpHandle::WaitInputVarGenerated(const platform::Place &place) { auto cpu_ctx = platform::DeviceContextPool::Instance().Get(place); for (auto *input : inputs_) { - if (input->generated_op_) { - input->generated_op_->RecordWaitEventOnCtx(cpu_ctx); + if (input->GeneratedOp()) { + input->GeneratedOp()->RecordWaitEventOnCtx(cpu_ctx); } } } diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index e09bdd1d3338bb175c1ddae35b53f98197b68e9a..6ce42f92d7f1e81eeafd1eb5c28ce3564a5ffebc 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -28,7 +28,7 @@ namespace details { struct FetchOpHandle : public OpHandleBase { public: - FetchOpHandle(FeedFetchList *data, size_t offset, + FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, std::vector *local_scopes); ~FetchOpHandle(); diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 140fb5bb49a33146de974b6d79559b4cf15bdd7b..3f360c510a4fdc0caaeb15d862b217ef41b8ea6e 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -30,10 +30,12 @@ namespace details { struct FuseVarsOpHandle : public OpHandleBase { public: - FuseVarsOpHandle(Scope *local_scope, const platform::Place &place, + FuseVarsOpHandle(ir::Node *node, Scope *local_scope, + const platform::Place &place, const std::unordered_map &inputs_numel, const std::type_index &var_type) - : local_scope_(local_scope), + : OpHandleBase(node), + local_scope_(local_scope), place_(place), inputs_numel_(inputs_numel), type_(var_type) { diff --git a/paddle/fluid/framework/details/gather_op_handle.cc b/paddle/fluid/framework/details/gather_op_handle.cc index 2be02304566cf5dbe348fa01fc4171990eafd158..9aae19fc73de4387186da47c55710c94d53f1b88 100644 --- a/paddle/fluid/framework/details/gather_op_handle.cc +++ b/paddle/fluid/framework/details/gather_op_handle.cc @@ -20,9 +20,10 @@ namespace paddle { namespace framework { namespace details { -GatherOpHandle::GatherOpHandle(const std::vector &local_scopes, +GatherOpHandle::GatherOpHandle(ir::Node *node, + const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} void GatherOpHandle::RunImpl() { if (places_.size() == 1) return; diff --git a/paddle/fluid/framework/details/gather_op_handle.h b/paddle/fluid/framework/details/gather_op_handle.h index d11ef8556aa8840949ca8dc7aa176413f70b9f22..d9afbc6547e18e8886c414ff150e332cfaf9b0c3 100644 --- a/paddle/fluid/framework/details/gather_op_handle.h +++ b/paddle/fluid/framework/details/gather_op_handle.h @@ -30,7 +30,7 @@ namespace details { struct GatherOpHandle : public OpHandleBase { public: - GatherOpHandle(const std::vector &local_scopes, + GatherOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places); std::string Name() const override; diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 3cce2cc1640b3866130126424ff8fef18b8befc6..c9b94d1e1039df6ff27f9ffe225b2a50c35a5c50 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -70,6 +70,7 @@ struct TestGatherOpHandle { } void InitGatherOp(size_t input_scope_idx) { + std::vector> nodes; for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); Scope& local_scope = local_scopes_.back()->NewScope(); @@ -81,30 +82,37 @@ struct TestGatherOpHandle { } param_scopes_[input_scope_idx]->Var("out"); - op_handle_.reset(new GatherOpHandle(local_scopes_, gpu_list_)); + nodes.emplace_back(new ir::Node("node", ir::Node::Type::kOperation)); + op_handle_.reset( + new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - auto* in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); + nodes.emplace_back(new ir::Node("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = + new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); } // add dummy var - vars_.emplace_back(new DummyVarHandle()); + nodes.emplace_back(new ir::Node("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* in_dummy_var_handle = static_cast(vars_.back().get()); - in_dummy_var_handle->generated_op_ = nullptr; + in_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(in_dummy_var_handle); // add output - auto* out_var_handle = - new VarHandle(2, input_scope_idx, "out", gpu_list_[input_scope_idx]); + nodes.emplace_back(new ir::Node("node3", ir::Node::Type::kVariable)); + auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx, + "out", gpu_list_[input_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); // add dummy var - vars_.emplace_back(new DummyVarHandle()); + nodes.emplace_back(new ir::Node("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes.back().get())); DummyVarHandle* dummy_var_handle = static_cast(vars_.back().get()); op_handle_->AddOutput(dummy_var_handle); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 6f5d4471a97cc4efc73b9df68040ab9eccde0b1c..c52980472de8d48e8c21e7c1e53813aa4847cece 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/rpc_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" @@ -66,31 +67,38 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( } } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result, - const OpDesc &op, +void MultiDevSSAGraphBuilder::CreateOpHandleIOs(Graph *result, ir::Node *node, size_t place_id) const { auto p = places_[place_id]; - auto *op_handle = result->ops_.back().get(); + auto *op_handle = result->Get("ops").back().get(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); - for (auto &each_var_name : op.InputArgumentNames()) { - VarHandle *var = - CreateOrGetLatestVarHandle(result, each_var_name, p, place_id); + for (ir::Node *input : node->inputs) { + VarHandle *var = CreateOrGetLatestVarHandle(result, input, p, place_id); op_handle->AddInput(var); } - for (auto &each_var_name : op.OutputArgumentNames()) { - CreateOpOutput(result, op_handle, each_var_name, p, place_id); + for (ir::Node *output : node->outputs) { + ir::Node *new_node = nullptr; + if (output->Var()) { + new_node = result->CreateVarNode(output->Var()); + } else { + new_node = + result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable); + } + CreateOpOutput(result, op_handle, new_node, p, place_id); } } std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( - const ProgramDesc &program) const { + const std::vector> &nodes) const { std::vector send_vars; // since parameters are all in block 0, // it's enough to only scan send ops in block 0 - for (auto *op : program.Block(0).AllOps()) { + for (auto &node : nodes) { + if (node->NodeType() != ir::Node::Type::kOperation) continue; + OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find send op, // instead of the the hard code string if (op->Type() == "send") { @@ -104,9 +112,11 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainSendVars( } std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( - const ProgramDesc &program) const { + const std::vector> &nodes) const { std::vector recv_vars; - for (auto *op : program.Block(0).AllOps()) { + for (auto &node : nodes) { + if (node->NodeType() != ir::Node::Type::kOperation) continue; + OpDesc *op = node->Op(); // TODO(Yancey1989): use a graceful method to find recv op, // instead of the hard code string if (op->Type() == "recv") { @@ -120,7 +130,7 @@ std::vector MultiDevSSAGraphBuilder::FindDistTrainRecvVars( } bool MultiDevSSAGraphBuilder::IsDistTrainOp( - const OpDesc &op, const std::vector &send_vars, + ir::Node *node, const std::vector &send_vars, const std::vector &recv_vars) const { if (send_vars.size() == 0 || recv_vars.size() == 0) { return false; @@ -143,8 +153,17 @@ bool MultiDevSSAGraphBuilder::IsDistTrainOp( return false; }; - return checker(op.OutputArgumentNames(), send_vars) || - checker(op.InputArgumentNames(), recv_vars); + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + return checker(output_var_names, send_vars) || + checker(input_var_names, recv_vars); } size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( @@ -167,25 +186,30 @@ size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( return dev_id; } -std::unique_ptr MultiDevSSAGraphBuilder::Build( - const ProgramDesc &program) const { - for (auto *var : program.Block(0).AllVars()) { - all_vars_.emplace(var->Name(), var); +std::unique_ptr MultiDevSSAGraphBuilder::Apply( + std::unique_ptr graph) const { + // Rebuild the graph structure. + auto nodes = std::move(graph->nodes); + graph->nodes.clear(); + + for (auto &node : nodes) { + if (node->NodeType() == ir::Node::Type::kVariable) { + all_vars_.emplace(node->Name(), node->Var()); + } } - auto graph = new SSAGraph(); - SSAGraph &result = *graph; + Graph &result = *graph; std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 - result.vars_ = std::vector< - std::unordered_map>>>( - places_.size()); + result.Set("vars", new GraphVars(places_.size())); + result.Set("dep_vars", new GraphDepVars); + result.Set("ops", new GraphOps); // find send/recv vars so that we can place the distributed training // realted op in the place 0 - auto send_vars = FindDistTrainSendVars(program); - auto recv_vars = FindDistTrainRecvVars(program); + auto send_vars = FindDistTrainSendVars(nodes); + auto recv_vars = FindDistTrainRecvVars(nodes); std::vector> bcast_var_name_set; bcast_var_name_set.resize(places_.size()); @@ -193,14 +217,19 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( size_t cur_device_id = 0; bool is_forwarding = true; - for (auto *op : program.Block(0).AllOps()) { + // NOTE: Currently, passes before SSAGraphBuilder cannot reorder + // forward, backward nodes. E.g. you can't append an forward node + // at the end of the node list. + // TODO(panyx0718): FIXME: Needs to sort by forward->backward order. + for (auto &node : nodes) { + if (node->NodeType() != ir::Node::Type::kOperation) continue; if (boost::get( - op->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - CreateRPCOp(&result, *op); - } else if (IsDistTrainOp(*op, send_vars, recv_vars)) { - CreateDistTrainOp(&result, *op); - } else if (IsScaleLossOp(*op)) { + CreateRPCOp(&result, node.get()); + } else if (IsDistTrainOp(node.get(), send_vars, recv_vars)) { + CreateDistTrainOp(&result, node.get()); + } else if (IsScaleLossOp(node.get())) { // user can customize loss@grad if not use_default_grad_scale_ if (strategy_.gradient_scale_ != BuildStrategy::GradientScaleStrategy::kCustomized) { @@ -212,33 +241,35 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(*op); + int op_dev_id = GetOpDeviceID(node.get()); if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, *op, op_dev_id); - for (auto &var_name : op->OutputArgumentNames()) { - var_name_on_devices_.emplace(var_name, op_dev_id); + CreateComputationalOp(&result, node.get(), op_dev_id); + for (ir::Node *n : node->outputs) { + var_name_on_devices_.emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's // gradients. - if (op->Type() == "read" && strategy_.enable_data_balance_) { - op->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, *op, places_.size()); - const auto &data_var_names = op->Output("Out"); + if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { + node->Op()->SetAttr("throw_eof_exp", false); + CreateComputationalOps(&result, node.get(), places_.size()); + // TODO(paddle-dev): builder shouldn't depend on the out logic of + // a specific op. + const auto &data_var_names = node->Op()->Output("Out"); InsertDataBalanceOp(&result, data_var_names); } else { - CreateComputationalOps(&result, *op, places_.size()); + CreateComputationalOps(&result, node.get(), places_.size()); } if (!is_forwarding && places_.size() > 1) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - if (static_cast(boost::get(op->GetAttr( + if (static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward))) { try { - auto backward_vars = - boost::get>(op->GetNullableAttr( + auto backward_vars = boost::get>( + node->Op()->GetNullableAttr( OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); @@ -302,8 +333,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( * Only variables should be the leaves of graph. */ AddOutputToLeafOps(&result); - - return std::unique_ptr(graph); + return std::move(graph); } bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { @@ -327,78 +357,96 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(SSAGraph *result, +void MultiDevSSAGraphBuilder::CreateBroadcastOp(Graph *result, const std::string &p_name, size_t src_dev_id) const { #ifdef PADDLE_WITH_CUDA - auto *op_handle = new BroadcastOpHandle(local_scopes_, places_, nccl_ctxs_); + auto *op_handle = new BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_); #else - auto *op_handle = new BroadcastOpHandle(local_scopes_, places_); + auto *op_handle = new BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_); #endif + result->Get("ops").emplace_back(op_handle); - result->ops_.emplace_back(op_handle); - auto *in = result->vars_.at(src_dev_id).at(p_name).back().get(); + auto *in = + result->Get("vars").at(src_dev_id).at(p_name).back().get(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->vars_.at(i).at(p_name); - auto *out_var = new VarHandle(vars.size(), i, p_name, p); + auto &vars = result->Get("vars").at(i).at(p_name); + auto *out_var = new VarHandle( + result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), + i, p_name, p); vars.emplace_back(out_var); op_handle->AddOutput(out_var); } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(SSAGraph *result, - const OpDesc &op, +void MultiDevSSAGraphBuilder::CreateComputationalOp(Graph *result, + ir::Node *node, int dev_id) const { - result->ops_.emplace_back( - new ComputationOpHandle(op, local_scopes_[dev_id], places_[dev_id])); - CreateOpHandleIOs(result, op, dev_id); + result->Get("ops").emplace_back( + new ComputationOpHandle(result->CreateOpNode(node->Op()), + local_scopes_[dev_id], places_[dev_id])); + CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(SSAGraph *result, +void MultiDevSSAGraphBuilder::InsertAllReduceOp(Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new AllReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); + result->Get("ops").emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); #else - result->ops_.emplace_back(new AllReduceOpHandle(local_scopes_, places_)); + result->Get("ops").emplace_back(new AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + local_scopes_, places_)); #endif - auto *op_handle = result->ops_.back().get(); + auto *op_handle = result->Get("ops").back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->vars_[i][og]; + auto &vars = result->Get("vars")[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); - auto var = new VarHandle(vars.size(), i, og, p); + auto var = + new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), + vars.size(), i, og, p); vars.emplace_back(var); op_handle->AddOutput(var); } } void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - SSAGraph *result, const std::vector &datas) const { + Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new DataBalanceOpHandle(local_scopes_, places_, nccl_ctxs_)); + result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); #else - result->ops_.emplace_back(new DataBalanceOpHandle(local_scopes_, places_)); + result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), + local_scopes_, places_)); #endif - auto *op_handle = result->ops_.back().get(); + auto *op_handle = result->Get("ops").back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { - auto &vars = result->vars_[i][d_name]; + auto &vars = result->Get("vars")[i][d_name]; PADDLE_ENFORCE(!vars.empty()); op_handle->AddInput(vars.back().get()); - auto var = new VarHandle(vars.size(), i, d_name, p); + auto var = new VarHandle( + result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), + vars.size(), i, d_name, p); vars.emplace_back(var); op_handle->AddOutput(var); } @@ -417,22 +465,22 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } int op_role = boost::get( - op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); + node->Op()->GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName())); if (op_role != static_cast(framework::OpRole::kOptimize)) { return -1; } auto param_grad = boost::get>( - op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); int dev_id = GetVarDeviceID(param_grad[1]); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(), - param_grad[0]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", + node->Op()->Type(), param_grad[0]); return dev_id; } @@ -441,7 +489,7 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { return got == var_name_on_devices_.end() ? -1 : got->second; } -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { +void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(Graph *result) const { for (size_t i = 0; i < places_.size(); ++i) { // Insert ScaleCost OpHandle #ifdef PADDLE_WITH_CUDA @@ -452,11 +500,11 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { auto *communication_dev_ctx = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); #endif - - auto *op_handle = - new ScaleLossGradOpHandle(local_scopes_.size(), local_scopes_[i], - places_[i], communication_dev_ctx); - result->ops_.emplace_back(op_handle); + auto *op_handle = new ScaleLossGradOpHandle( + result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), + local_scopes_.size(), local_scopes_[i], places_[i], + communication_dev_ctx); + result->Get("ops").emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. @@ -464,43 +512,51 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(SSAGraph *result) const { // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - CreateOpOutput(result, op_handle, GradVarName(loss_var_name_), places_[i], - i); + CreateOpOutput(result, op_handle, + result->CreateEmptyNode(GradVarName(loss_var_name_), + ir::Node::Type::kVariable), + places_[i], i); } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(SSAGraph *result, - const OpDesc &op, +void MultiDevSSAGraphBuilder::CreateComputationalOps(Graph *result, + ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->ops_.emplace_back(new ComputationOpHandle(op, s, p)); - CreateOpHandleIOs(result, op, scope_idx); + result->Get("ops").emplace_back( + new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + CreateOpHandleIOs(result, node, scope_idx); } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, +VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA - result->ops_.emplace_back( - new ReduceOpHandle(local_scopes_, places_, nccl_ctxs_)); + result->Get("ops").emplace_back(new ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_, nccl_ctxs_)); #else - result->ops_.emplace_back(new ReduceOpHandle(local_scopes_, places_)); + result->Get("ops").emplace_back(new ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + local_scopes_, places_)); #endif - auto *op_handle = result->ops_.back().get(); + auto *op_handle = result->Get("ops").back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->vars_[i][og]; + auto &vars = result->Get("vars")[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } - auto &vars = result->vars_[dst_dev_id][og]; - auto var = new VarHandle(vars.size(), dst_dev_id, og, places_[dst_dev_id]); + auto &vars = result->Get("vars")[dst_dev_id][og]; + auto var = + new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), + vars.size(), dst_dev_id, og, places_[dst_dev_id]); vars.emplace_back(var); op_handle->AddOutput(var); return var; @@ -508,35 +564,46 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(SSAGraph *result, // Find the first occurence of `prev_op_name` and make current `op` depend // on it. -void MultiDevSSAGraphBuilder::ConnectOp(SSAGraph *result, OpHandleBase *op, +void MultiDevSSAGraphBuilder::ConnectOp(Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { - for (auto &prev_op : result->ops_) { + for (auto &prev_op : result->Get("ops")) { if (prev_op->Name() == prev_op_name) { - auto *dep_var = new DummyVarHandle(); + auto *dep_var = new DummyVarHandle( + result->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); prev_op->AddOutput(dep_var); - result->dep_vars_.emplace(dep_var); + result->Get("dep_vars").emplace(dep_var); op->AddInput(dep_var); } } } -void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, - const OpDesc &op) const { +void MultiDevSSAGraphBuilder::CreateDistTrainOp(Graph *result, + ir::Node *node) const { int op_dev_id = -1; - if (op.Type() == "split_byref" || op.Type() == "split_selected_rows") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows") { + op_dev_id = GetVarDeviceID(input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); - for (auto &varname : op.InputArgumentNames()) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { var_name_on_devices_.emplace(varname, op_dev_id); } } - for (auto &varname : op.OutputArgumentNames()) { + for (auto &varname : output_var_names) { var_name_on_devices_.emplace(varname, op_dev_id); } - } else if (op.Type() == "concat") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); - for (auto &varname : op.OutputArgumentNames()) { + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { var_name_on_devices_.emplace(varname, op_dev_id); } } else { @@ -546,34 +613,43 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(SSAGraph *result, } PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", op.Type()); + "can not find right place for distributed op: %s", + node->Op()->Type()); - CreateComputationalOp(result, op, op_dev_id); - if (op.Type() == "concat") { - ConnectOp(result, result->ops_.back().get(), "fetch_barrier"); + CreateComputationalOp(result, node, op_dev_id); + if (node->Op()->Type() == "concat") { + ConnectOp(result, result->Get("ops").back().get(), + "fetch_barrier"); } } // Create RPC related op handles that connects its in ops and out ops. -void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, - const OpDesc &op) const { +void MultiDevSSAGraphBuilder::CreateRPCOp(Graph *result, ir::Node *node) const { int op_dev_id = -1; - if (op.Type() == "send") { - op_dev_id = GetVarDeviceID(op.InputArgumentNames()[0]); + if (node->Op()->Type() == "send") { + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); // the variable name which contains .block means it was splited by // split_byref op // so that we can balance the variable blocks to all the pserver // instances. if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce && - op.InputArgumentNames()[0].find(".block") == std::string::npos) { - op_dev_id = GetAppropriateDeviceID(op.InputArgumentNames()); - for (auto &varname : op.InputArgumentNames()) { + node->inputs[0]->Name().find(".block") == std::string::npos) { + std::vector input_var_names; + for (ir::Node *n : node->inputs) { + input_var_names.push_back(n->Name()); + } + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { var_name_on_devices_.emplace(varname, op_dev_id); } } - } else if (op.Type() == "recv") { - op_dev_id = GetAppropriateDeviceID(op.OutputArgumentNames()); - for (auto &varname : op.OutputArgumentNames()) { + } else if (node->Op()->Type() == "recv") { + std::vector output_var_names; + for (ir::Node *n : node->outputs) { + output_var_names.push_back(n->Name()); + } + op_dev_id = GetAppropriateDeviceID(output_var_names); + for (auto &varname : output_var_names) { var_name_on_devices_.emplace(varname, op_dev_id); } } else { @@ -582,18 +658,20 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, } PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", - op.Type()); - - result->ops_.emplace_back(new RPCOpHandle(op, local_scopes_[op_dev_id], - op.Type(), places_[op_dev_id])); - - if (op.Type() == "send_barrier") { - ConnectOp(result, result->ops_.back().get(), "send"); - } else if (op.Type() == "recv") { - ConnectOp(result, result->ops_.back().get(), "send_barrier"); - } else if (op.Type() == "fetch_barrier") { - ConnectOp(result, result->ops_.back().get(), "recv"); - } else if (op.Type() == "send") { + node->Op()->Type()); + + result->Get("ops").emplace_back(new RPCOpHandle( + result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], + node->Op()->Type(), places_[op_dev_id])); + + if (node->Op()->Type() == "send_barrier") { + ConnectOp(result, result->Get("ops").back().get(), "send"); + } else if (node->Op()->Type() == "recv") { + ConnectOp(result, result->Get("ops").back().get(), + "send_barrier"); + } else if (node->Op()->Type() == "fetch_barrier") { + ConnectOp(result, result->Get("ops").back().get(), "recv"); + } else if (node->Op()->Type() == "send") { // do nothing } else { PADDLE_THROW( @@ -601,12 +679,12 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(SSAGraph *result, "send, send_barrier. recv, fetch_barrier]"); } - CreateOpHandleIOs(result, op, op_dev_id); + CreateOpHandleIOs(result, node, op_dev_id); } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(const OpDesc &op) const { +bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { return boost::get( - op.GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == (static_cast(OpRole::kBackward) | static_cast(OpRole::kLoss)) && !loss_var_name_.empty(); // If loss_var is empty. This is test mode diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index a964e024885e56693224a6199e00ff30beaa1df4..2b7f4f586b4e750fde9245286c977258a9db6086 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/ssa_graph_builder.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace platform { @@ -45,13 +46,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &local_scopes, const BuildStrategy &strategy); #endif - - std::unique_ptr Build(const ProgramDesc &program) const override; + std::unique_ptr Apply(std::unique_ptr graph) const override; int GetVarDeviceID(const std::string &varname) const override; private: - void CreateOpHandleIOs(SSAGraph *result, const OpDesc &op, - size_t device_id) const; + void CreateOpHandleIOs(Graph *result, ir::Node *node, size_t device_id) const; private: std::string loss_var_name_; @@ -63,48 +62,46 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { platform::NCCLContextMap *nccl_ctxs_; #endif - bool IsScaleLossOp(const OpDesc &op) const; + bool IsScaleLossOp(ir::Node *node) const; - void CreateRPCOp(SSAGraph *result, const OpDesc &op) const; - void CreateDistTrainOp(SSAGraph *result, const OpDesc &op) const; + void CreateRPCOp(Graph *result, ir::Node *node) const; + void CreateDistTrainOp(Graph *result, ir::Node *node) const; /** * Is this operator as the end-point operator before/after send operator. */ - bool IsDistTrainOp(const OpDesc &op, - const std::vector &send_vars, + bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, const std::vector &recv_vars) const; std::vector FindDistTrainSendVars( - const ProgramDesc &program) const; + const std::vector> &nodes) const; std::vector FindDistTrainRecvVars( - const ProgramDesc &program) const; + const std::vector> &nodes) const; - void ConnectOp(SSAGraph *result, OpHandleBase *op, + void ConnectOp(Graph *result, OpHandleBase *op, const std::string &prev_op_name) const; - void CreateComputationalOps(SSAGraph *result, const OpDesc &op, + void CreateComputationalOps(Graph *result, ir::Node *node, size_t num_places) const; - void CreateScaleLossGradOp(SSAGraph *result) const; - VarHandle *CreateReduceOp(SSAGraph *result, const std::string &og, + void CreateScaleLossGradOp(Graph *result) const; + VarHandle *CreateReduceOp(Graph *result, const std::string &og, int dst_dev_id) const; - void CreateComputationalOp(SSAGraph *result, const OpDesc &op, - int dev_id) const; + void CreateComputationalOp(Graph *result, ir::Node *node, int dev_id) const; bool IsParameterGradientOnce( const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(const OpDesc &op) const; + int GetOpDeviceID(ir::Node *node) const; - void InsertAllReduceOp(SSAGraph *result, const std::string &og) const; + void InsertAllReduceOp(Graph *result, const std::string &og) const; - void InsertDataBalanceOp(SSAGraph *result, + void InsertDataBalanceOp(Graph *result, const std::vector &datas) const; - void CreateBroadcastOp(SSAGraph *result, const std::string &p_name, + void CreateBroadcastOp(Graph *result, const std::string &p_name, size_t src_dev_id) const; bool IsSparseGradient(const std::string &og) const; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index d80bdcf15d798925c137460125964d3d7e65f67e..ee9f9184da65467b82794c99fe3e95b108373753 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -80,19 +80,21 @@ void OpHandleBase::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { void OpHandleBase::AddInput(VarHandleBase *in) { this->inputs_.emplace_back(in); - in->pending_ops_.insert(this); + node_->inputs.push_back(in->Node()); + in->AddOutput(this, this->Node()); } void OpHandleBase::AddOutput(VarHandleBase *out) { outputs_.emplace_back(out); - out->generated_op_ = this; + node_->outputs.push_back(out->Node()); + out->AddInput(this, this->Node()); } void OpHandleBase::WaitInputVarGenerated() { for (auto in_var : inputs_) { if (NeedWait(in_var)) { for (auto &pair : dev_ctxes_) { - in_var->generated_op_->RecordWaitEventOnCtx(pair.second); + in_var->GeneratedOp()->RecordWaitEventOnCtx(pair.second); } } } @@ -101,7 +103,7 @@ void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { for (auto *in : inputs_) { if (NeedWait(in)) { - in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[place]); + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[place]); } } } @@ -117,7 +119,7 @@ size_t OpHandleBase::NoDummyInputSize() const { } bool OpHandleBase::NeedWait(VarHandleBase *in_var) { - return in_var && in_var->generated_op_; + return in_var && in_var->GeneratedOp(); } void OpHandleBase::RunAndRecordEvent(const std::function &callback) { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 6aec178831161f8ac1306fc3ed72e3267ca3c7e5..2d7f18942890245249dd0619a40bb43833c9a2ee 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" @@ -26,9 +27,11 @@ namespace details { constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +// Wraps ir::Node and provide helper utilities. +// It's responsible for populating necessary fields of ir::Node. class OpHandleBase { public: - OpHandleBase() {} + explicit OpHandleBase(ir::Node *node) : node_(node) {} virtual ~OpHandleBase(); @@ -82,6 +85,8 @@ class OpHandleBase { size_t NoDummyInputSize() const; + ir::Node *Node() { return node_; } + protected: void RunAndRecordEvent(const std::function &callback); @@ -90,6 +95,7 @@ class OpHandleBase { virtual void RunImpl() = 0; + ir::Node *node_; std::vector inputs_; std::vector outputs_; std::map dev_ctxes_; diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 4d14334cdfe06e2e805c2577458d6689e6324cc7..a6289b055f97b7b0e57928358d84117b33cf2df8 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -37,10 +37,13 @@ struct ReduceOpHandle : public OpHandleBase { #ifdef PADDLE_WITH_CUDA const platform::NCCLContextMap *nccl_ctxs_; - ReduceOpHandle(const std::vector &local_scopes, + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(nccl_ctxs) { + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + nccl_ctxs_(nccl_ctxs) { if (nccl_ctxs_) { for (auto &p_ctx : nccl_ctxs_->contexts_) { dev_ctxes_[platform::CUDAPlace(p_ctx.first)] = p_ctx.second.ctx_.get(); @@ -48,9 +51,9 @@ struct ReduceOpHandle : public OpHandleBase { } } #else - ReduceOpHandle(const std::vector &local_scopes, + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) - : local_scopes_(local_scopes), places_(places) {} + : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} #endif std::string Name() const override; diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index ffdd7c14eb5097cc8285da090e4a72e1e3f43d86..3a9a58412391b188c5e804b41fa47b3607a36bd1 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -84,6 +84,7 @@ struct TestReduceOpHandle { } void InitReduceOp(size_t out_scope_idx) { + std::vector> nodes; // init scope for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); @@ -96,19 +97,21 @@ struct TestReduceOpHandle { } param_scopes_[out_scope_idx]->Var("out"); + nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset( - new ReduceOpHandle(local_scopes_, gpu_list_, nccl_ctxs_.get())); + op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, + gpu_list_, nccl_ctxs_.get())); #else - op_handle_.reset(new ReduceOpHandle(local_scopes_, gpu_list_)); + op_handle_.reset( + new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); #endif } @@ -118,8 +121,10 @@ struct TestReduceOpHandle { if (!use_gpu_) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); } - auto *in_var_handle = new VarHandle(1, j, "input", gpu_list_[j]); - in_var_handle->generated_op_ = nullptr; + nodes.emplace_back(new ir::Node("node1")); + auto *in_var_handle = + new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); + in_var_handle->ClearGeneratedOp(); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); } @@ -128,12 +133,13 @@ struct TestReduceOpHandle { vars_.emplace_back(new DummyVarHandle()); DummyVarHandle *in_dummy_var_handle = static_cast(vars_.back().get()); - in_dummy_var_handle->generated_op_ = nullptr; + in_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(in_dummy_var_handle); // add output - auto *out_var_handle = - new VarHandle(2, out_scope_idx, "out", gpu_list_[out_scope_idx]); + nodes.emplace_back(new ir::Node("node2")); + auto *out_var_handle = new VarHandle(nodes.back().get(), 2, out_scope_idx, + "out", gpu_list_[out_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); diff --git a/paddle/fluid/framework/details/rpc_op_handle.cc b/paddle/fluid/framework/details/rpc_op_handle.cc index 586465f99fd94117c821be2952bffda385fbcf75..924ff4d118a192a43e5828a38fd1abbaac1a8526 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.cc +++ b/paddle/fluid/framework/details/rpc_op_handle.cc @@ -18,10 +18,11 @@ namespace paddle { namespace framework { namespace details { -RPCOpHandle::RPCOpHandle(const framework::OpDesc &op_desc, +RPCOpHandle::RPCOpHandle(ir::Node *node, const framework::OpDesc &op_desc, const Scope *local_scope, const std::string &name, const platform::Place &place) - : op_(framework::OpRegistry::CreateOp(op_desc)), + : OpHandleBase(node), + op_(framework::OpRegistry::CreateOp(op_desc)), local_scope_(local_scope), name_(name), place_(place) {} @@ -35,8 +36,8 @@ void RPCOpHandle::RunImpl() { if (in->DebugString() == "dummy") { // HACK continue; } - if (in->generated_op_) { - in->generated_op_->RecordWaitEventOnCtx(dev_ctxes_[p]); + if (in->GeneratedOp()) { + in->GeneratedOp()->RecordWaitEventOnCtx(dev_ctxes_[p]); } } auto &tmp_scope = local_scope_->FindVar(kLocalExecScopeName)->Get(); diff --git a/paddle/fluid/framework/details/rpc_op_handle.h b/paddle/fluid/framework/details/rpc_op_handle.h index ae38c7fe19e102a330455d89a1068414a7835fab..7f99cdeacf618a9496eaef98520685d6d1621ae1 100644 --- a/paddle/fluid/framework/details/rpc_op_handle.h +++ b/paddle/fluid/framework/details/rpc_op_handle.h @@ -28,8 +28,9 @@ namespace framework { namespace details { struct RPCOpHandle : public OpHandleBase { - RPCOpHandle(const framework::OpDesc& op_desc, const Scope* local_scope, - const std::string& name, const platform::Place& place); + RPCOpHandle(ir::Node* node, const framework::OpDesc& op_desc, + const Scope* local_scope, const std::string& name, + const platform::Place& place); std::string Name() const override; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index d9c387e79dc71288e7330597fed57171d447f31b..609e18581957f62b040e04e937873b7a8fa5785a 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -19,10 +19,14 @@ namespace paddle { namespace framework { namespace details { -ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope, +ScaleLossGradOpHandle::ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, + Scope *scope, platform::Place place, platform::DeviceContext *dev_ctx) - : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { + : OpHandleBase(node), + coeff_(static_cast(1.0 / num_dev)), + scope_(scope), + place_(place) { dev_ctxes_[place_] = dev_ctx; } diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index d93d599d46f130cf98f39f15697ce994a31e20c3..523b55724c82d4e2bef0520c10e5708c952a3ecc 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -25,7 +25,8 @@ namespace framework { namespace details { struct ScaleLossGradOpHandle : public OpHandleBase { - ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place, + ScaleLossGradOpHandle(ir::Node *node, size_t num_dev, Scope *scope, + platform::Place place, platform::DeviceContext *context); ~ScaleLossGradOpHandle() final; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index 20df7a4722d589ffd168f842e927cff8411096bb..cbfbcb1c0cd24f16773f9633310166371600790c 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -17,6 +17,9 @@ #include #include #include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" #include "paddle/fluid/framework/scope.h" diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/framework/details/ssa_graph.cc deleted file mode 100644 index 1b8c889449059c563ea39f86250075ac2537cdbe..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph.cc +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/ssa_graph.h" diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h deleted file mode 100644 index e996a00c162186e47e77d007503ac67caa9f8024..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/details/var_handle.h" - -namespace paddle { -namespace framework { -namespace details { - -// A SSA graph used by parallel executor. -struct SSAGraph { - // all variable in each devices. - // The outside vector is the device vector. Each element of this vector is a - // map from variable name to variables. The variables, who have the same name, - // will have a different version. The offset in the - // `std::vector>` is the version of varaibles. - std::vector< - std::unordered_map>>> - vars_; - - // aux variables to represent dependency. Useful to resolve data hazard. - std::unordered_set> dep_vars_; - - // all operators. NOTE that even we use a vector here, the operators is - // unordered. - std::vector> ops_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 88a21f48879a15450051ad94ed76e1c48bf23014..7bc130ef6e8d2e0caf6e445d12950b87e6dd4dbd 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -17,8 +17,8 @@ namespace paddle { namespace framework { namespace details { -void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { - for (auto &var_map : graph->vars_) { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(Graph *graph) { + for (auto &var_map : graph->Get("vars")) { for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { continue; @@ -27,8 +27,8 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { auto it_old = name_pair.second.rbegin(); ++it_old; for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = (*it_new)->generated_op_; - auto &read_ops = (*it_old)->pending_ops_; + OpHandleBase *write_op = (*it_new)->GeneratedOp(); + const auto &read_ops = (*it_old)->PendingOps(); for (auto *read_op : read_ops) { // Manually add a dependency var from read_op to write_op; @@ -37,10 +37,11 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { continue; } - auto *dep_var = new DummyVarHandle(); + auto *dep_var = new DummyVarHandle( + graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); read_op->AddOutput(dep_var); write_op->AddInput(dep_var); - graph->dep_vars_.emplace(dep_var); + graph->Get("dep_vars").emplace(dep_var); } } } @@ -48,13 +49,20 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { } VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( - SSAGraph *graph, const std::string &each_var_name, - const platform::Place &place, size_t place_offset) { - auto &var_holders = graph->vars_[place_offset]; - auto &var_holder = var_holders[each_var_name]; + Graph *graph, ir::Node *node, const platform::Place &place, + size_t place_offset) { + auto &var_holders = graph->Get("vars")[place_offset]; + auto &var_holder = var_holders[node->Name()]; VarHandle *var = nullptr; if (var_holder.empty()) { - var = new VarHandle(0, place_offset, each_var_name, place); + if (node->Var()) { + var = new VarHandle(graph->CreateVarNode(node->Var()), 0, place_offset, + node->Name(), place); + } else { + var = new VarHandle( + graph->CreateEmptyNode(node->Name(), ir::Node::Type::kVariable), 0, + place_offset, node->Name(), place); + } var_holder.emplace_back(var); } else { var = var_holder.rbegin()->get(); @@ -62,24 +70,26 @@ VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( return var; } -void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, +void SSAGraphBuilder::CreateOpOutput(Graph *graph, OpHandleBase *op_handle, + ir::Node *new_node, const platform::Place &place, size_t place_offset) { - auto &vars = graph->vars_[place_offset][each_var_name]; + auto &vars = graph->Get("vars")[place_offset][new_node->Name()]; size_t version = vars.size(); - auto var = new VarHandle(version, place_offset, each_var_name, place); + auto var = + new VarHandle(new_node, version, place_offset, new_node->Name(), place); vars.emplace_back(var); op_handle->AddOutput(var); } -void SSAGraphBuilder::AddOutputToLeafOps(SSAGraph *graph) { - for (auto &op : graph->ops_) { +void SSAGraphBuilder::AddOutputToLeafOps(Graph *graph) { + for (auto &op : graph->Get("ops")) { if (!op->Outputs().empty()) { continue; } - auto *dummy_leaf = new DummyVarHandle(); - graph->dep_vars_.emplace(dummy_leaf); + auto *dummy_leaf = new DummyVarHandle( + graph->CreateEmptyNode("dummy", ir::Node::Type::kVariable)); + graph->Get("dep_vars").emplace(dummy_leaf); op->AddOutput(dummy_leaf); } } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 18612c3c1b62cf4c2ebdc221c301c59ec81c2da7..e8e8acdb38f893302fb92c47d6f1cb2d38453e0f 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -16,20 +16,42 @@ #include #include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + namespace paddle { namespace framework { namespace details { -class SSAGraphBuilder { +// all variable in each devices. +// The outside vector is the device vector. Each element of this vector is a +// map from variable name to variables. The variables, who have the same name, +// will have a differsent version. The offset in the +// `std::vector>` is the version of varaibles. +typedef std::vector< + std::unordered_map>>> + GraphVars; + +// aux variables to represent dependency. Useful to resolve data hazard. +typedef std::unordered_set> GraphDepVars; + +// all operators. NOTE that even we use a vector here, the operators is +// unordered. +typedef std::vector> GraphOps; + +class SSAGraphBuilder : public ir::Pass { public: SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} - virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; + virtual int GetVarDeviceID(const std::string &var_name) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); @@ -42,20 +64,19 @@ class SSAGraphBuilder { * * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) */ - static void PolishGraphToSupportDataHazards(SSAGraph *graph); + static void PolishGraphToSupportDataHazards(Graph *graph); - static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, - const std::string &each_var_name, + static VarHandle *CreateOrGetLatestVarHandle(Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset); // Add an output variable (each_var_name, place, place_offset) to op_handle, // which belongs to graph - static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, - const platform::Place &place, size_t place_offset); + static void CreateOpOutput(Graph *graph, OpHandleBase *op_handle, + ir::Node *new_node, const platform::Place &place, + size_t place_offset); - static void AddOutputToLeafOps(SSAGraph *graph); + static void AddOutputToLeafOps(Graph *graph); }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc index da5428946ee588e8eac1f78929dc0432df532975..7c79d7f1e881c67514634d56caa715c41927dbce 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -12,15 +12,15 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/ssa_graph.h" -#include #include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { +bool SSAGraghBuilderWithChecker::IsValidGraph(const Graph *graph) const { std::unordered_map pending_ops; std::unordered_set pending_vars; std::unordered_set ready_vars; @@ -28,12 +28,12 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { auto insert_pending_var = [&](VarHandleBase *var) { pending_vars.insert(var); - if (var->generated_op_ == nullptr) { + if (var->GeneratedOp() == nullptr) { ready_vars.emplace(var); } }; - for (auto &var_map : graph->vars_) { + for (auto &var_map : graph->Get("vars")) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { insert_pending_var(version_pair.get()); @@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { } } - for (auto &var : graph->dep_vars_) { + for (auto &var : graph->Get("dep_vars")) { insert_pending_var(var.get()); } - for (auto &op : graph->ops_) { + for (auto &op : graph->Get("ops")) { if (op->Inputs().empty()) { ready_ops.insert(op.get()); } else { @@ -71,7 +71,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const SSAGraph *graph) const { for (auto ready_var : ready_vars) { pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { + for (auto *op : ready_var->PendingOps()) { auto &deps = --pending_ops[op]; if (deps == 0) { ready_ops.insert(op); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 331aa9d2b5864c470dbd5e29ef6faccffdcf781c..f1080610381128325ea0affba760ac66798fd948 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -21,7 +21,6 @@ namespace paddle { namespace framework { namespace details { -struct SSAGraph; class SSAGraghBuilderWithChecker : public SSAGraphBuilder { public: @@ -29,17 +28,17 @@ class SSAGraghBuilderWithChecker : public SSAGraphBuilder { std::unique_ptr&& builder) : builder_(std::move(builder)) {} - std::unique_ptr Build(const ProgramDesc& program) const override { - auto graph = builder_->Build(program); - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; + std::unique_ptr Apply(std::unique_ptr graph) const override { + auto new_graph = builder_->Apply(std::move(graph)); + PADDLE_ENFORCE(IsValidGraph(new_graph.get())); + return std::move(new_graph); } int GetVarDeviceID(const std::string& var_name) const override { return builder_->GetVarDeviceID(var_name); } - bool IsValidGraph(const SSAGraph* graph) const; + bool IsValidGraph(const Graph* graph) const; private: std::unique_ptr builder_; diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 958086033607a4ed8fb840f5b14fe5779625bd82..8815ec89b23bc874471eefde5fa855cd2a4bde1f 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -18,8 +18,8 @@ #include #include -#include "paddle/fluid/framework/details/ssa_graph.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc index 22a40ca4b25cdd8ed9856b6c71bffc79561edcac..6dd6fd262e35a192ba85eb3aa16660526d2ebca2 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -14,15 +14,15 @@ #include "paddle/fluid/framework/details/ssa_graph_printer.h" #include -#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { namespace details { template -static inline void IterAllVar(const SSAGraph &graph, Callback callback) { - for (auto &each : graph.vars_) { +static inline void IterAllVar(const Graph &graph, Callback callback) { + for (auto &each : graph.Get("vars")) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,12 +30,12 @@ static inline void IterAllVar(const SSAGraph &graph, Callback callback) { } } - for (auto &var : graph.dep_vars_) { + for (auto &var : graph.Get("dep_vars")) { callback(*var); } } -void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, +void GraphvizSSAGraphPrinter::Print(const Graph &graph, std::ostream &sout) const { size_t var_id = 0; std::unordered_map vars; @@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const SSAGraph &graph, }); size_t op_id = 0; - for (auto &op : graph.ops_) { + for (auto &op : graph.Get("ops")) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index 09b0333ef2cb43a306133aa5af98d37c11454d4d..411be02988a82b3e35d56833f92fc6fe405a2c3d 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -21,16 +21,16 @@ namespace paddle { namespace framework { namespace details { -struct SSAGraph; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} - virtual void Print(const SSAGraph& graph, std::ostream& sout) const = 0; + virtual void Print(const Graph& graph, std::ostream& sout) const = 0; }; class GraphvizSSAGraphPrinter : public SSAGraphPrinter { public: - void Print(const SSAGraph& graph, std::ostream& sout) const override; + void Print(const Graph& graph, std::ostream& sout) const override; }; class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { @@ -50,10 +50,10 @@ class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { stream_ptr_(std::move(sout)), stream_ref_(*stream_ptr_) {} - std::unique_ptr Build(const ProgramDesc& program) const override { - auto graph = builder_->Build(program); - printer_->Print(*graph, stream_ref_); - return graph; + std::unique_ptr Apply(std::unique_ptr graph) const override { + auto new_graph = builder_->Apply(std::move(graph)); + printer_->Print(*new_graph, stream_ref_); + return std::move(new_graph); } int GetVarDeviceID(const std::string& var_name) const override { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index bad9a6b2ba1cb2f61a1314c0bb59b9d35858d23f..f85c62dd6c4a8033a037b1e001ece6a9cc90ca98 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -14,13 +14,14 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + namespace paddle { namespace framework { namespace details { ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) + const std::vector &places, std::unique_ptr &&graph) : graph_(std::move(graph)), pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) : nullptr), @@ -43,18 +44,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->vars_) { + for (auto &var_map : graph_->Get("vars")) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->dep_vars_) { + for (auto &var : graph_->Get("dep_vars")) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->ops_) { + for (auto &op : graph_->Get("ops")) { if (op->Inputs().empty()) { // Special case, Op has no input. ready_ops.insert(op.get()); } else { @@ -64,11 +65,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Step 2. Insert FetchOps std::vector> fetch_ops; + std::vector> tmp_nodes; std::unordered_set> fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); - InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, - &pending_vars, &ready_vars, &fetch_data); + InsertFetchOps(fetch_tensors, &fetch_ops, &tmp_nodes, &fetch_dependencies, + &pending_ops, &pending_vars, &ready_vars, &fetch_data); auto run_all_ops = [&](std::unordered_set &set) { for (auto *op : set) { @@ -125,7 +127,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Find the ready_ops after the ready_var. for (auto ready_var : cur_ready_vars) { pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { + for (auto *op : ready_var->PendingOps()) { auto &deps = pending_ops[op]; --deps; if (deps == 0) { @@ -151,6 +153,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, + std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, @@ -158,7 +161,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->vars_) { + for (auto &var_map : graph_->Get("vars")) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); @@ -168,14 +171,16 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; - auto fetched_var_it = fetched_vars.find(var_name); PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(), "Cannot find fetched variable.(Perhaps the main_program " "is not set to ParallelExecutor)"); auto &vars = fetched_var_it->second; - auto *op = new FetchOpHandle(fetch_data, i, &local_scopes_); + + temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *op = new FetchOpHandle(temp_nodes->back().get(), fetch_data, i, + &local_scopes_); fetch_ops->emplace_back(op); for (auto &p : places_) { @@ -186,7 +191,8 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( op->AddInput(var); } - auto *fetch_dummy = new DummyVarHandle(); + temp_nodes->emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation)); + auto *fetch_dummy = new DummyVarHandle(temp_nodes->back().get()); op->AddOutput(fetch_dummy); fetch_dependencies->emplace(fetch_dummy); this->InsertPendingVar(pending_vars, ready_vars, fetch_dummy); @@ -204,7 +210,7 @@ void ThreadedSSAGraphExecutor::InsertPendingVar( std::unordered_set *pending_vars, BlockingQueue *ready_vars, VarHandleBase *var) const { pending_vars->insert(var); - if (var->generated_op_ == nullptr) { + if (var->GeneratedOp() == nullptr) { ready_vars->Push(var); } } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 09973b7a72881464ad9e7776d4aad3d2261a118d..bf7c0a367a19ff4ac9462334516f1577672faa68 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { namespace framework { @@ -39,7 +40,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, - std::unique_ptr &&graph); + std::unique_ptr &&graph); // Run a SSAGraph by a thread pool // Use topological sort algorithm @@ -52,7 +53,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: - std::unique_ptr graph_; + std::unique_ptr graph_; std::unique_ptr<::ThreadPool> pool_; std::vector local_scopes_; std::vector places_; @@ -71,6 +72,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void InsertFetchOps( const std::vector &fetch_tensors, std::vector> *fetch_ops, + std::vector> *temp_nodes, std::unordered_set> *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index cae9af7217660fb7e4b8535ee8e022fb3a127668..d8c2bc40b9458a1d5a7dd8a32277d04f69295f09 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -13,11 +13,14 @@ // limitations under the License. #pragma once + +#include #include #include #include #include +#include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -25,19 +28,60 @@ namespace framework { namespace details { class OpHandleBase; +// Wraps ir::Node and provide helper utilities. +// It's responsible for populating necessary fields of ir::Node. +// // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. // This is a single assignment graph. struct VarHandleBase { + explicit VarHandleBase(ir::Node* node) : node_(node) {} + virtual ~VarHandleBase(); + virtual std::string DebugString() const = 0; + void AddInput(OpHandleBase* in, ir::Node* node) { + node_->inputs.clear(); + node_->inputs.push_back(node); + generated_op_ = in; + } + + void AddOutput(OpHandleBase* out, ir::Node* node) { + if (pending_ops_.find(out) == pending_ops_.end()) { + pending_ops_.insert(out); + node_->outputs.push_back(node); + } + } + + void RemoveOutput(OpHandleBase* out, ir::Node* node) { + pending_ops_.erase(out); + node_->outputs.erase( + std::remove(node_->outputs.begin(), node_->outputs.end(), node), + node_->outputs.end()); + } + + void ClearGeneratedOp() { + generated_op_ = nullptr; + node_->inputs.clear(); + } + + OpHandleBase* GeneratedOp() { return generated_op_; } + + const std::unordered_set& PendingOps() const { + return pending_ops_; + } + + ir::Node* Node() { return node_; } + + protected: // The operator who generate this variable. nullptr if the variable // is a root node. OpHandleBase* generated_op_{nullptr}; // Operators which depend on this variable ready. std::unordered_set pending_ops_; + ir::Node* node_; }; // VarHandle is actually a single version of Runtime Variable. @@ -46,11 +90,14 @@ struct VarHandleBase { // // NOTE: runtime variables have place. struct VarHandle : public VarHandleBase { + explicit VarHandle(ir::Node* node) : VarHandleBase(node) {} + std::string DebugString() const override; - VarHandle(size_t version, size_t scope_index, std::string name, - platform::Place place) - : version_(version), + VarHandle(ir::Node* node, size_t version, size_t scope_index, + std::string name, platform::Place place) + : VarHandleBase(node), + version_(version), scope_idx_(scope_index), name_(std::move(name)), place_(std::move(place)) {} @@ -70,6 +117,8 @@ struct VarHandle : public VarHandleBase { // Dummy Variable. It is used to represent dependencies between operators struct DummyVarHandle : public VarHandleBase { + explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {} + std::string DebugString() const override; }; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8ed06aa69c7f08b0600aa87cd482469ec78dfa3 --- /dev/null +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(graph SRCS graph.cc node) +cc_library(node SRCS node.cc) +cc_library(pass SRCS pass.cc graph node) + +cc_test(graph_test SRCS graph_test.cc DEPS graph proto_desc op_registry) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc new file mode 100644 index 0000000000000000000000000000000000000000..688f7ba5825bf1a1ab65a0912663481913223e80 --- /dev/null +++ b/paddle/fluid/framework/ir/graph.cc @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { + +// NOTE(paddle-dev): This graph contains circle. +Graph::Graph(const ProgramDesc &program) : program_(program) { + std::unordered_map all_vars; + for (auto *var : program.Block(0).AllVars()) { + all_vars.emplace(var->Name(), var); + } + + std::map var_nodes; + for (auto *op : program.Block(0).AllOps()) { + ir::Node *node = CreateOpNode(op); + + for (auto &each_var_name : op->InputArgumentNames()) { + ir::Node *var = nullptr; + if (var_nodes.find(each_var_name) != var_nodes.end()) { + var = var_nodes.at(each_var_name); + } else if (all_vars.count(each_var_name) != 0) { + var = CreateVarNode(all_vars.at(each_var_name)); + var_nodes[each_var_name] = var; + } else { + // TODO(paddle-dev): Seems some assumption doesn't hold? + LOG(ERROR) << op->Type() + << " input var not in all_var list: " << each_var_name; + var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable); + var_nodes[each_var_name] = var; + } + node->inputs.push_back(var); + var->outputs.push_back(node); + } + + for (auto &each_var_name : op->OutputArgumentNames()) { + ir::Node *var = nullptr; + if (var_nodes.find(each_var_name) != var_nodes.end()) { + var = var_nodes.at(each_var_name); + } else { + var = CreateVarNode(all_vars.at(each_var_name)); + var_nodes[each_var_name] = var; + } + node->outputs.push_back(var); + var->inputs.push_back(node); + } + } +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h new file mode 100644 index 0000000000000000000000000000000000000000..b4ac135b029005b723abca2cb9b9a9aa175eda40 --- /dev/null +++ b/paddle/fluid/framework/ir/graph.h @@ -0,0 +1,82 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/variant.h" + +namespace paddle { +namespace framework { + +class Graph { + public: + explicit Graph(const ProgramDesc& program); + + virtual ~Graph() { + for (auto& attr : attrs_) { + attr_dels_[attr.first](); + } + attrs_.clear(); + attr_dels_.clear(); + } + + template + AttrType& Get(const std::string& attr_name) const { + return *boost::any_cast(attrs_.at(attr_name)); + } + + template + void Set(const std::string& attr_name, AttrType* attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + ir::Node* CreateVarNode(VarDesc* var_desc) { + nodes.emplace_back(new ir::Node(var_desc)); + return nodes.back().get(); + } + + ir::Node* CreateOpNode(OpDesc* op_desc) { + nodes.emplace_back(new ir::Node(op_desc)); + return nodes.back().get(); + } + + ir::Node* CreateEmptyNode(const std::string& name, ir::Node::Type type) { + nodes.emplace_back(new ir::Node(name, type)); + return nodes.back().get(); + } + + std::vector> nodes; + + private: + // NOTE: program_ shouldn't be exposed to user. + const ProgramDesc& program_; + std::map attrs_; + std::map> attr_dels_; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e23bf124f8822e25be0f6b1c7c8c5de4e4f600a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/graph.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class NOP : public OperatorBase { + public: + NOP(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope &scope, + const platform::Place &place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class SumOpVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc &op_desc, BlockDesc *block) const override { + auto &inputs = op_desc.Input("X"); + auto default_var_type = proto::VarType::SELECTED_ROWS; + + bool any_input_is_lod_tensor = std::any_of( + inputs.begin(), inputs.end(), [block](const std::string &name) { + return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; + }); + if (any_input_is_lod_tensor) { + default_var_type = proto::VarType::LOD_TENSOR; + } + + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(default_var_type); + } +}; +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker, + paddle::framework::SumOpVarTypeInference); +REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP, + paddle::framework::SumOpMaker); + +namespace paddle { +namespace framework { + +TEST(GraphTest, Basic) { + ProgramDesc prog; + auto *op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"test_a", "test_b", "test_c"}); + op->SetOutput("Out", {"test_out"}); + + prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarType::SELECTED_ROWS); + prog.MutableBlock(0)->Var("test_out"); + + op->InferVarType(prog.MutableBlock(0)); + + ASSERT_EQ(proto::VarType::SELECTED_ROWS, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarType::LOD_TENSOR); + op->InferVarType(prog.MutableBlock(0)); + ASSERT_EQ(proto::VarType::LOD_TENSOR, + prog.MutableBlock(0)->Var("test_out")->GetType()); + + std::unique_ptr g(new Graph(prog)); + ASSERT_EQ(g->nodes[0]->Name(), "sum"); + ASSERT_EQ(g->nodes[0]->inputs[0]->Name(), "test_a"); + ASSERT_EQ(g->nodes[0]->inputs[1]->Name(), "test_b"); + ASSERT_EQ(g->nodes[0]->inputs[2]->Name(), "test_c"); + ASSERT_EQ(g->nodes[0]->outputs[0]->Name(), "test_out"); + ASSERT_EQ(g->nodes[1]->Name(), "test_a"); + ASSERT_EQ(g->nodes[1]->outputs[0]->Name(), "sum"); + ASSERT_EQ(g->nodes[2]->Name(), "test_b"); + ASSERT_EQ(g->nodes[2]->outputs[0]->Name(), "sum"); + ASSERT_EQ(g->nodes[3]->Name(), "test_c"); + ASSERT_EQ(g->nodes[3]->outputs[0]->Name(), "sum"); + ASSERT_EQ(g->nodes[4]->Name(), "test_out"); + ASSERT_EQ(g->nodes[4]->inputs[0]->Name(), "sum"); + ASSERT_EQ(g->nodes.size(), 5); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc new file mode 100644 index 0000000000000000000000000000000000000000..86376e7e8bc8bee2ddbc18f7f24bcdd849a06cbf --- /dev/null +++ b/paddle/fluid/framework/ir/node.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/node.h" + +namespace paddle { +namespace framework {} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h new file mode 100644 index 0000000000000000000000000000000000000000..b98c29b81ddc2f57553b8fe76fcfeb0936ddd837 --- /dev/null +++ b/paddle/fluid/framework/ir/node.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Node { + public: + enum class Type { kOperation, kVariable }; + explicit Node(const std::string& name, Type type) + : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} + + explicit Node(VarDesc* var_desc) + : name_(var_desc->Name()), + var_desc_(var_desc), + op_desc_(nullptr), + type_(Type::kVariable) {} + + explicit Node(OpDesc* op_desc) + : name_(op_desc->Type()), + var_desc_(nullptr), + op_desc_(op_desc), + type_(Type::kOperation) {} + + Type NodeType() const { return type_; } + + std::string Name() const { return name_; } + + VarDesc* Var() { + PADDLE_ENFORCE(type_ == Type::kVariable); + return var_desc_; + } + OpDesc* Op() { + PADDLE_ENFORCE(type_ == Type::kOperation); + return op_desc_; + } + + std::vector inputs; + std::vector outputs; + + protected: + const std::string name_; + VarDesc* var_desc_; + OpDesc* op_desc_; + Type type_; + + private: + DISABLE_COPY_AND_ASSIGN(Node); +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..c05d7d0bb54c8ba5938e08f7e8dace8f607d7b89 --- /dev/null +++ b/paddle/fluid/framework/ir/pass.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework {} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h new file mode 100644 index 0000000000000000000000000000000000000000..f52ba788d55ddb9ed27baa3f6ff0a97e52370fe0 --- /dev/null +++ b/paddle/fluid/framework/ir/pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Pass { + public: + Pass() = default; + virtual ~Pass() {} + + virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 9a72e1baa34274201c40bd83a7aace549a7fc6ae..1e5bba62b53025dacdbf2d74b35f266cf4e422c2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/ir/graph.h" + #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -129,12 +131,11 @@ ParallelExecutor::ParallelExecutor( PADDLE_THROW("Not compiled with CUDA."); #endif } - builder_ = builder_factory.Create(); + std::unique_ptr graph(new Graph(main_program)); + graph = builder_->Apply(std::move(graph)); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, - builder_->Build(main_program))); - + exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( exec_strategy, member_->local_scopes_, std::move(var_infos), member_->places_, std::move(member_->executor_))); diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index cdd67fdc929851979fe0a38afe1af74ec7321b8a..67d355d10d3c9e11b59c9ce9d208826523095459 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -19,10 +19,14 @@ function (inference_analysis_test TARGET) set(multiValueArgs SRCS) cmake_parse_arguments(analysis_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(mem_opt "") + if(WITH_GPU) + set(mem_opt "--fraction_of_gpu_memory_to_use=0.5") + endif() cc_test(${TARGET} SRCS "${analysis_test_SRCS}" DEPS analysis - ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model --fraction_of_gpu_memory_to_use=0.5) + ARGS --inference_model_dir=${PYTHON_TESTS_DIR}/book/word2vec.inference.model ${mem_opt}) set_tests_properties(${TARGET} PROPERTIES DEPENDS test_word2vec) endif(WITH_TESTING) endfunction(inference_analysis_test) diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 9d9e126e134deafc18f10f7299e81d92702e9ca0..786dc8e827806a9cea9dc01788fada2fd754b930 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -66,6 +66,7 @@ bool NativePaddlePredictor::Init( if (parent_scope) { scope_ = parent_scope; sub_scope_ = &(parent_scope->NewScope()); + PADDLE_ENFORCE_NOT_NULL(sub_scope_, "create sub scope fail"); } else { paddle::framework::InitDevices(false); scope_.reset(new paddle::framework::Scope()); @@ -102,7 +103,6 @@ bool NativePaddlePredictor::Init( NativePaddlePredictor::~NativePaddlePredictor() { if (sub_scope_) { - PADDLE_ENFORCE_NOT_NULL(scope_, "Should have parent scope!"); scope_->DeleteScope(sub_scope_); } } diff --git a/paddle/fluid/inference/api/high_level_api.md b/paddle/fluid/inference/api/high_level_api.md index eb92885052a453d8c837bbf6f6e984efb509332a..8b8b6916d7e2b1a2f9fd09e9dfd2fe5a332461f5 100644 --- a/paddle/fluid/inference/api/high_level_api.md +++ b/paddle/fluid/inference/api/high_level_api.md @@ -57,4 +57,4 @@ By specifying the engine kind and config, one can get a specific implementation. ## Reference - [paddle_inference_api.h](./paddle_inference_api.h) -- [some demos](./demo) +- [some demos](./demo_ci) diff --git a/paddle/fluid/inference/api/high_level_api_cn.md b/paddle/fluid/inference/api/high_level_api_cn.md index a57f015a4e44d43ee4e475cf606faa6f05e095fa..2fb914592cbcb1b0c3f2ef33ff9cf4c295e427b6 100644 --- a/paddle/fluid/inference/api/high_level_api_cn.md +++ b/paddle/fluid/inference/api/high_level_api_cn.md @@ -83,5 +83,5 @@ CHECK(predictor->Run(slots, &outputs)); ## 详细代码参考 -- [inference demos](./demo) -- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc) +- [inference demos](./demo_ci) +- [复杂单线程/多线程例子](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/test_api_impl.cc) diff --git a/paddle/fluid/platform/variant.h b/paddle/fluid/platform/variant.h index 45f60fc9d76560b133fa06198a24c7eaccc24088..dc9fad29f281a1c6ac300b48f9e600ff802a5752 100644 --- a/paddle/fluid/platform/variant.h +++ b/paddle/fluid/platform/variant.h @@ -38,6 +38,7 @@ limitations under the License. */ #endif #endif +#include #include #include #include diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 216c4666c0a311f93f29692b2ca1d17bf9dafab8..3e13e7b1ffebf92301df69084b058ca55783e578 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -248,15 +248,11 @@ PYBIND11_PLUGIN(core) { #endif }) .def("rows", [](SelectedRows &self) { -#ifndef PADDLE_WITH_CUDA - return self.rows(); -#else - auto rows = self.rows(); - std::vector new_rows; - new_rows.reserve(rows.size()); - std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); - return new_rows; -#endif + auto rows = self.rows(); + std::vector new_rows; + new_rows.reserve(rows.size()); + std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); + return new_rows; }); py::class_(m, "Variable", R"DOC(Variable Class. diff --git a/paddle/fluid/pybind/recordio.cc b/paddle/fluid/pybind/recordio.cc index 330d104e0a774d905e463566f85bd2e64a080190..f83b026d4d50772b969c4316964b70a68b27442b 100644 --- a/paddle/fluid/pybind/recordio.cc +++ b/paddle/fluid/pybind/recordio.cc @@ -30,7 +30,9 @@ class RecordIOWriter { public: RecordIOWriter(const std::string& filename, recordio::Compressor compressor, size_t max_num_record) - : stream_(filename), writer_(&stream_, compressor, max_num_record) {} + : closed_(false), + stream_(filename), + writer_(&stream_, compressor, max_num_record) {} void AppendTensor(const framework::LoDTensor& tensor) { tensors_.push_back(tensor); @@ -47,9 +49,17 @@ class RecordIOWriter { PADDLE_ENFORCE(tensors_.empty()); writer_.Flush(); stream_.close(); + closed_ = true; + } + + ~RecordIOWriter() { + if (!closed_) { + Close(); + } } private: + bool closed_; std::vector tensors_; std::ofstream stream_; recordio::Writer writer_; diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index 9d05aeeb95c4f936cb773ece20407ecb32cbbf21..6259cc35b4f7bb781886bb5da9d16924831d7246 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size): for i in xrange(buffer_size): yield images[i, :], int(labels[i]) finally: - m.terminate() - l.terminate() + try: + m.terminate() + except: + pass + try: + l.terminate() + except: + pass return reader diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 329d3c5ace2ee59a1f7eb5cb52965720a290880e..9903047f749b942c50692ac7f9164fc1c91569b4 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -35,6 +35,7 @@ import io import evaluator import initializer import layers +import contrib import nets import optimizer import backward @@ -66,6 +67,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \ 'io', 'initializer', 'layers', + 'contrib', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/contrib/__init__.py b/python/paddle/fluid/contrib/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..12cd5d918e93181c6b7e328e6aee4ad941b0a0da --- /dev/null +++ b/python/paddle/fluid/contrib/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import decoder +from decoder import * + +__all__ = decoder.__all__ diff --git a/python/paddle/fluid/contrib/decoder/__init__.py b/python/paddle/fluid/contrib/decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..22cfe692690a686f32eba34ee34b9193f0d5ba35 --- /dev/null +++ b/python/paddle/fluid/contrib/decoder/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import beam_search_decoder +from beam_search_decoder import * + +__all__ = beam_search_decoder.__all__ diff --git a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..ba6e13878291ad9f30e92f998767df6d8c6f32c3 --- /dev/null +++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py @@ -0,0 +1,838 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This module provides a general beam search decoder API for RNN based decoders. +The purpose of this API is to allow users to highly customize the behavior +within their RNN decoder(vanilla RNN, LSTM, attention + LSTM, future etc.), +without using the low level API such as while ops. + +This API is still under active development and may change drastically. +""" + +import contextlib +import numpy as np + +from ... import layers +from ...framework import Variable +from ... import core +from ... import framework, unique_name +from ...layer_helper import LayerHelper + +__all__ = ['InitState', 'StateCell', 'TrainingDecoder', 'BeamSearchDecoder'] + + +class _DecoderType: + TRAINING = 1 + BEAM_SEARCH = 2 + + +class InitState(object): + """ + The initial hidden state object. The state objects holds a variable, and may + use it to initialize the hidden state cell of RNN. Usually used as input to + `StateCell` class. + + Args: + init (Variable): The initial variable of the hidden state. If set None, + the variable will be created as a tensor with constant value based + on `shape` and `value` param. + shape (tuple|list): If `init` is None, new Variable's shape. Default + None. + value (float): If `init` is None, new Variable's value. Default None. + init_boot (Variable): If provided, the initial variable will be created + with the same shape as this variable. + need_reorder (bool): If set true, the init will be sorted by its lod + rank within its batches. This should be used if `batch_size > 1`. + dtype (np.dtype|core.VarDesc.VarType|str): Data type of the initial + variable. + + Returns: + An initialized state object. + + Examples: + See `StateCell`. + """ + + def __init__(self, + init=None, + shape=None, + value=0.0, + init_boot=None, + need_reorder=False, + dtype='float32'): + if init is not None: + self._init = init + elif init_boot is None: + raise ValueError( + 'init_boot must be provided to infer the shape of InitState .\n') + else: + self._init = layers.fill_constant_batch_size_like( + input=init_boot, value=value, shape=shape, dtype=dtype) + + self._shape = shape + self._value = value + self._need_reorder = need_reorder + self._dtype = dtype + + @property + def value(self): + return self._init + + @property + def need_reorder(self): + return self._need_reorder + + +class _MemoryState(object): + def __init__(self, state_name, rnn_obj, init_state): + self._state_name = state_name # each is a rnn.memory + self._rnn_obj = rnn_obj + self._state_mem = self._rnn_obj.memory( + init=init_state.value, need_reorder=init_state.need_reorder) + + def get_state(self): + return self._state_mem + + def update_state(self, state): + self._rnn_obj.update_memory(self._state_mem, state) + + +class _ArrayState(object): + def __init__(self, state_name, block, init_state): + self._state_name = state_name + self._block = block + + self._state_array = self._block.create_var( + name=unique_name.generate('array_state_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init_state.value.dtype) + + self._counter = self._block.create_var( + name=unique_name.generate('array_state_counter'), + type=core.VarDesc.VarType.LOD_TENSOR, + dtype='int64') + + # initialize counter + self._block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [self._counter]}, + attrs={ + 'shape': [1], + 'dtype': self._counter.dtype, + 'value': float(0.0), + 'force_cpu': True + }) + + self._counter.stop_gradient = True + + # write initial state + block.append_op( + type='write_to_array', + inputs={'X': init_state.value, + 'I': self._counter}, + outputs={'Out': self._state_array}) + + def get_state(self): + state = layers.array_read(array=self._state_array, i=self._counter) + return state + + def update_state(self, state): + layers.increment(x=self._counter, value=1, in_place=True) + layers.array_write(state, array=self._state_array, i=self._counter) + + +class StateCell(object): + """ + The state cell class stores the hidden state of the RNN cell. A typical RNN + cell has one or more hidden states, and one or more step inputs. This class + allows you to defines the name of hidden states as well as step inputs, and + their associated variables. + + Args: + inputs (dict): A feeding dict of {name(str) : Variable}. It specifies + the names of step inputs for RNN cell, and the associated variables. + The variable could initially be None and set manually during each + RNN step. + states (dict): A feeding dict of {name(str) : InitState object}. It + specifies the names of hidden states and their initialized state. + out_state (str): A string that specifies the name of hidden state that + will be used to compute the score in beam search process. + name (str): The name of the RNN cell. Default None. + + Raises: + `ValueError`: If the initial state is not an instance of InitState, or + the out_state is not in the dict of states. + + Returns: + StateCell: The initialized StateCell object. + + Examples: + .. code-block:: python + hidden_state = InitState(init=encoder_out, need_reorder=True) + state_cell = StateCell( + inputs={'current_word': None}, + states={'h': hidden_state}, + out_state='h') + """ + + def __init__(self, inputs, states, out_state, name=None): + self._helper = LayerHelper('state_cell', name=name) + self._cur_states = {} + self._state_names = [] + for state_name, state in states.items(): + if not isinstance(state, InitState): + raise ValueError('state must be an InitState object.') + self._cur_states[state_name] = state + self._state_names.append(state_name) + self._inputs = inputs # inputs is place holder here + self._cur_decoder_obj = None + self._in_decoder = False + self._states_holder = {} + self._switched_decoder = False + self._state_updater = None + self._out_state = out_state + if self._out_state not in self._cur_states: + raise ValueError('out_state must be one state in states') + + def _enter_decoder(self, decoder_obj): + if self._in_decoder == True or self._cur_decoder_obj is not None: + raise ValueError('StateCell has already entered a decoder.') + self._in_decoder = True + self._cur_decoder_obj = decoder_obj + self._switched_decoder = False + + def _leave_decoder(self, decoder_obj): + if not self._in_decoder: + raise ValueError('StateCell not in decoder, ' + 'invalid leaving operation.') + + if self._cur_decoder_obj != decoder_obj: + raise ValueError('Inconsistent decoder object in StateCell.') + + self._in_decoder = False + self._cur_decoder_obj = None + self._switched_decoder = False + + def _switch_decoder(self): # lazy switch + if not self._in_decoder: + raise ValueError('StateCell must be enter a decoder.') + + if self._switched_decoder: + raise ValueError('StateCell already done switching.') + + for state_name in self._state_names: + if state_name not in self._states_holder: + state = self._cur_states[state_name] + + if not isinstance(state, InitState): + raise ValueError('Current type of state is %s, should be ' + 'an InitState object.' % type(state)) + + self._states_holder[state_name] = {} + + if self._cur_decoder_obj.type == _DecoderType.TRAINING: + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = _MemoryState(state_name, + self._cur_decoder_obj.dynamic_rnn, + state) + elif self._cur_decoder_obj.type == _DecoderType.BEAM_SEARCH: + self._states_holder[state_name][id(self._cur_decoder_obj)] \ + = _ArrayState(state_name, + self._cur_decoder_obj._parent_block(), + state) + else: + raise ValueError('Unknown decoder type, only support ' + '[TRAINING, BEAM_SEARCH]') + + # Read back, since current state should be LoDTensor + self._cur_states[state_name] = \ + self._states_holder[state_name][ + id(self._cur_decoder_obj)].get_state() + + self._switched_decoder = True + + def get_state(self, state_name): + """ + The getter of state object. Find the state variable by its name. + + Args: + state_name (str): A string of the state's name. + + Returns: + The associated state object. + """ + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + + if state_name not in self._cur_states: + raise ValueError( + 'Unknown state %s. Please make sure _switch_decoder() ' + 'invoked.' % state_name) + + return self._cur_states[state_name] + + def get_input(self, input_name): + """ + The getter of input variable. Find the input variable by its name. + + Args: + input_name (str): The string of the input's name. + + Returns: + The associated input variable. + """ + if input_name not in self._inputs or self._inputs[input_name] is None: + raise ValueError('Invalid input %s.' % input_name) + return self._inputs[input_name] + + def set_state(self, state_name, state_value): + """ + The setter of the state variable. Change the variable of the given + `state_name`. + + Args: + state_name (str): The name of the state to change. + state_value (Var): The variable of the new state. + """ + self._cur_states[state_name] = state_value + + def state_updater(self, updater): + """ + Set up the updater to update the hidden state every RNN step. The + behavior of updater could be customized by users. The updater should be + a function that takes a `StateCell` object as input and update the + hidden state within it. The hidden state could be accessed through + `get_state` method. + + Args: + updater (func): the updater to update the state cell. + """ + self._state_updater = updater + + def _decorator(state_cell): + if state_cell == self: + raise TypeError('Updater should only accept a StateCell object ' + 'as argument.') + updater(state_cell) + + return _decorator + + def compute_state(self, inputs): + """ + Provide the step input of RNN cell, and compute the new hidden state + with updater and give step input. + + Args: + inputs (dict): A feed dict, {name(str): Variable}. name should be + the names of step inputs for this RNN cell, and Variable should be + the associated variables. + + Examples: + .. code-block:: python + state_cell.compute_state(inputs={'x': current_word}) + """ + if self._in_decoder and not self._switched_decoder: + self._switch_decoder() + + for input_name, input_value in inputs.items(): + if input_name not in self._inputs: + raise ValueError('Unknown input %s. ' + 'Please make sure %s in input ' + 'place holder.' % (input_name, input_name)) + self._inputs[input_name] = input_value + self._state_updater(self) + + def update_states(self): + """ + Update and record state information after each RNN step. + """ + if self._in_decoder and not self._switched_decoder: + self._switched_decoder() + + for state_name, decoder_state in self._states_holder.items(): + if id(self._cur_decoder_obj) not in decoder_state: + raise ValueError('Unknown decoder object, please make sure ' + 'switch_decoder been invoked.') + decoder_state[id(self._cur_decoder_obj)].update_state( + self._cur_states[state_name]) + + def out_state(self): + """ + Get the output state variable. This must be called after update_states. + + Returns: + The output variable of the RNN cell. + """ + return self._cur_states[self._out_state] + + +class TrainingDecoder(object): + """ + A decoder that can only be used for training. The decoder could be + initialized with a `StateCell` object. The computation within the RNN cell + could be defined with decoder's block. + + Args: + state_cell (StateCell): A StateCell object that handles the input and + state variables. + name (str): The name of this decoder. Default None. + + Returns: + TrainingDecoder: The initialized TrainingDecoder object. + + Examples: + .. code-block:: python + decoder = TrainingDecoder(state_cell) + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=32, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + """ + BEFORE_DECODER = 0 + IN_DECODER = 1 + AFTER_DECODER = 2 + + def __init__(self, state_cell, name=None): + self._helper = LayerHelper('training_decoder', name=name) + self._status = TrainingDecoder.BEFORE_DECODER + self._dynamic_rnn = layers.DynamicRNN() + self._type = _DecoderType.TRAINING + self._state_cell = state_cell + self._state_cell._enter_decoder(self) + + @contextlib.contextmanager + def block(self): + """ + Define the behavior of the decoder for each RNN time step. + """ + if self._status != TrainingDecoder.BEFORE_DECODER: + raise ValueError('decoder.block() can only be invoked once') + self._status = TrainingDecoder.IN_DECODER + with self._dynamic_rnn.block(): + yield + self._status = TrainingDecoder.AFTER_DECODER + self._state_cell._leave_decoder(self) + + @property + def state_cell(self): + self._assert_in_decoder_block('state_cell') + return self._state_cell + + @property + def dynamic_rnn(self): + return self._dynamic_rnn + + @property + def type(self): + return self._type + + def step_input(self, x): + """ + Set the input variable as a step input to the RNN cell. For example, + in machine translation, each time step we read one word from the target + sentences, then the target sentence is a step input to the RNN cell. + + Args: + x (Variable): the variable to be used as step input. + + Returns: + Variable: The variable as input of current step. + + Examples: + .. code-block:: python + current_word = decoder.step_input(trg_embedding) + """ + self._assert_in_decoder_block('step_input') + return self._dynamic_rnn.step_input(x) + + def static_input(self, x): + """ + Set the input variable as a static input of RNN cell. In contrast to + step input, this variable will be used as a whole within the RNN decode + loop and will not be scattered into time steps. + + Args: + x (Variable): the variable to be used as static input. + + Returns: + Variable: The variable as input of current step. + + Examples: + .. code-block:: python + encoder_vec = decoder.static_input(encoded_vector) + """ + self._assert_in_decoder_block('static_input') + return self._dynamic_rnn.static_input(x) + + def __call__(self, *args, **kwargs): + """ + Get the output of RNN. This API should only be invoked after RNN.block() + + Returns: + Variable: The specified output of the RNN cell. + """ + if self._status != TrainingDecoder.AFTER_DECODER: + raise ValueError('Output of training decoder can only be visited ' + 'outside the block.') + return self._dynamic_rnn(*args, **kwargs) + + def output(self, *outputs): + """ + Set the output variable of the RNN cell. + + Args: + *outputs (Variables): a series of variables that treated as output + of the RNN cell. + + Examples: + .. code-block:: python + out = fluid.layers.fc(input=h, + size=32, + bias_attr=True, + act='softmax') + decoder.output(out) + """ + self._assert_in_decoder_block('output') + self._dynamic_rnn.output(*outputs) + + def _assert_in_decoder_block(self, method): + if self._status != TrainingDecoder.IN_DECODER: + raise ValueError('%s should be invoked inside block of ' + 'TrainingDecoder object.' % method) + + +class BeamSearchDecoder(object): + """ + A beam search decoder that can be used for inference. The decoder should be + initialized with a `StateCell` object. The decode process can be defined + within its block. + + Args: + state_cell (StateCell): A StateCell object that handles the input and + state variables. + init_ids (Variable): The init beam search token ids. + init_scores (Variable): The associated score of each id. + target_dict_dim (int): Size of dictionary. + word_dim (int): Word embedding dimension. + input_var_dict (dict): A feeding dict to feed the required input + variables to the state cell. It will be used by state_cell 's + compute method. Default empty. + topk_size (int): The topk size used for beam search. Default 50. + max_len (int): The maximum allowed length of the generated sentence. + Default 100. + beam_size (int): The beam width of beam search decode. Default 1. + end_id (int): The id of end token within beam search. + name (str): The name of this decoder. Default None. + + Returns: + BeamSearchDecoder: A initialized BeamSearchDecoder object. + + Examples: + .. code-block:: python + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=word_dim, + init_var_dict={}, + topk_size=topk_size, + sparse_emb=IS_SPARSE, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None + ) + decoder.decode() + translation_ids, translation_scores = decoder() + """ + BEFORE_BEAM_SEARCH_DECODER = 0 + IN_BEAM_SEARCH_DECODER = 1 + AFTER_BEAM_SEARCH_DECODER = 2 + + def __init__(self, + state_cell, + init_ids, + init_scores, + target_dict_dim, + word_dim, + input_var_dict={}, + topk_size=50, + sparse_emb=True, + max_len=100, + beam_size=1, + end_id=1, + name=None): + self._helper = LayerHelper('beam_search_decoder', name=name) + self._counter = layers.zeros(shape=[1], dtype='int64') + self._counter.stop_gradient = True + self._type = _DecoderType.BEAM_SEARCH + self._max_len = layers.fill_constant( + shape=[1], dtype='int64', value=max_len) + self._cond = layers.less_than( + x=self._counter, + y=layers.fill_constant( + shape=[1], dtype='int64', value=max_len)) + self._while_op = layers.While(self._cond) + self._state_cell = state_cell + self._state_cell._enter_decoder(self) + self._status = BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER + self._zero_idx = layers.fill_constant( + shape=[1], value=0, dtype='int64', force_cpu=True) + self._array_dict = {} + self._array_link = [] + self._ids_array = None + self._scores_array = None + self._beam_size = beam_size + self._end_id = end_id + + self._init_ids = init_ids + self._init_scores = init_scores + self._target_dict_dim = target_dict_dim + self._topk_size = topk_size + self._sparse_emb = sparse_emb + self._word_dim = word_dim + self._input_var_dict = input_var_dict + + @contextlib.contextmanager + def block(self): + """ + Define the behavior of the decoder for each RNN time step. + """ + if self._status != BeamSearchDecoder.BEFORE_BEAM_SEARCH_DECODER: + raise ValueError('block() can only be invoke once.') + + self._status = BeamSearchDecoder.IN_BEAM_SEARCH_DECODER + + with self._while_op.block(): + yield + with layers.Switch() as switch: + with switch.case(self._cond): + layers.increment(x=self._counter, value=1.0, in_place=True) + + for value, array in self._array_link: + layers.array_write( + x=value, i=self._counter, array=array) + + layers.less_than( + x=self._counter, y=self._max_len, cond=self._cond) + + self._status = BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER + self._state_cell._leave_decoder(self) + + @property + def type(self): + return self._type + + def early_stop(self): + """ + Stop the generation process in advance. Could be used as "break". + """ + layers.fill_constant( + shape=[1], value=0, dtype='bool', force_cpu=True, out=self._cond) + + def decode(self): + """ + Set up the computation within the decoder. Then you could call the + decoder to get the result of beam search decode. If you want to define + a more specific decoder, you could override this function. + + Examples: + .. code-block:: python + decoder.decode() + translation_ids, translation_scores = decoder() + """ + with self.block(): + prev_ids = self.read_array(init=self._init_ids, is_ids=True) + prev_scores = self.read_array( + init=self._init_scores, is_scores=True) + prev_ids_embedding = layers.embedding( + input=prev_ids, + size=[self._target_dict_dim, self._word_dim], + dtype='float32', + is_sparse=self._sparse_emb) + + feed_dict = {} + update_dict = {} + + for init_var_name, init_var in self._input_var_dict.items(): + if init_var_name not in self.state_cell._inputs: + raise ValueError('Variable ' + init_var_name + + ' not found in StateCell!\n') + + read_var = self.read_array(init=init_var) + update_dict[init_var_name] = read_var + feed_var_expanded = layers.sequence_expand(read_var, + prev_scores) + feed_dict[init_var_name] = feed_var_expanded + + for state_str in self._state_cell._state_names: + prev_state = self.state_cell.get_state(state_str) + prev_state_expanded = layers.sequence_expand(prev_state, + prev_scores) + self.state_cell.set_state(state_str, prev_state_expanded) + + for i, input_name in enumerate(self._state_cell._inputs): + if input_name not in feed_dict: + feed_dict[input_name] = prev_ids_embedding + + self.state_cell.compute_state(inputs=feed_dict) + current_state = self.state_cell.out_state() + current_state_with_lod = layers.lod_reset( + x=current_state, y=prev_scores) + scores = layers.fc(input=current_state_with_lod, + size=self._target_dict_dim, + act='softmax') + topk_scores, topk_indices = layers.topk(scores, k=self._topk_size) + accu_scores = layers.elementwise_add( + x=layers.log(x=topk_scores), + y=layers.reshape( + prev_scores, shape=[-1]), + axis=0) + selected_ids, selected_scores = layers.beam_search( + prev_ids, + prev_scores, + topk_indices, + accu_scores, + self._beam_size, + end_id=1, + level=0) + + with layers.Switch() as switch: + with switch.case(layers.is_empty(selected_ids)): + self.early_stop() + with switch.default(): + self.state_cell.update_states() + self.update_array(prev_ids, selected_ids) + self.update_array(prev_scores, selected_scores) + for update_name, var_to_update in update_dict.items(): + self.update_array(var_to_update, feed_dict[update_name]) + + def read_array(self, init, is_ids=False, is_scores=False): + """ + Read an array to get the decoded ids and scores generated by previous + RNN step. At the first step of RNN, the init variable mut be used to + initialize the array. + + Args: + init (Variable): The initial variable for first step usage. init + must be provided. + is_ids (bool): Specify whether the variable is an id. + is_scores (bool): Specify whether the variable is a score. + + Returns: + The associated variable generated during previous RNN steps. + + Examples: + .. code-block:: python + prev_ids = decoder.read_array(init=init_ids, is_ids=True) + prev_scores = decoder.read_array(init=init_scores, is_scores=True) + """ + self._assert_in_decoder_block('read_array') + + if is_ids and is_scores: + raise ValueError('Shouldn\'t mark current array be ids array and' + 'scores array at the same time.') + + if not isinstance(init, Variable): + raise TypeError('The input argument `init` must be a Variable.') + + parent_block = self._parent_block() + array = parent_block.create_var( + name=unique_name.generate('beam_search_decoder_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init, + 'I': self._zero_idx}, + outputs={'Out': array}) + + if is_ids: + self._ids_array = array + elif is_scores: + self._scores_array = array + + read_value = layers.array_read(array=array, i=self._counter) + self._array_dict[read_value.name] = array + return read_value + + def update_array(self, array, value): + """ + Store the value generated in current step in an array for each RNN step. + This array could be accessed by read_array method. + + Args: + array (Variable): The array to append the new variable to. + value (Variable): The newly generated value to be stored. + """ + self._assert_in_decoder_block('update_array') + + if not isinstance(array, Variable): + raise TypeError( + 'The input argument `array` of must be a Variable.') + if not isinstance(value, Variable): + raise TypeError('The input argument `value` of must be a Variable.') + + array = self._array_dict.get(array.name, None) + if array is None: + raise ValueError('Please invoke read_array before update_array.') + self._array_link.append((value, array)) + + def __call__(self): + """ + Run the decode process and return the final decode result. + + Returns: + A tuple of decoded (id, score) pairs. id is a Variable that holds + the generated tokens, and score is a Variable with the same shape + as id, holds the score for each generated token. + """ + if self._status != BeamSearchDecoder.AFTER_BEAM_SEARCH_DECODER: + raise ValueError('Output of BeamSearchDecoder object can ' + 'only be visited outside the block.') + return layers.beam_search_decode( + ids=self._ids_array, + scores=self._scores_array, + beam_size=self._beam_size, + end_id=self._end_id) + + @property + def state_cell(self): + self._assert_in_decoder_block('state_cell') + return self._state_cell + + def _parent_block(self): + """ + Getter of parent block. + + Returns: + The parent block of decoder. + """ + program = self._helper.main_program + parent_block_idx = program.current_block().parent_idx + if parent_block_idx < 0: + raise ValueError('Invalid block with index %d.' % parent_block_idx) + parent_block = program.block(parent_block_idx) + return parent_block + + def _assert_in_decoder_block(self, method): + if self._status != BeamSearchDecoder.IN_BEAM_SEARCH_DECODER: + raise ValueError('%s should be invoked inside block of ' + 'BeamSearchDecoder object.' % method) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 9a09db25dc0e2c71772aa06e6d0cf993321612e4..fd278f45f1c1b71a1653c3b28ace8bca8e4b1545 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -14,6 +14,7 @@ from __future__ import print_function import argparse import paddle.fluid as fluid +import paddle.fluid.core as core import paddle import sys import numpy @@ -134,4 +135,4 @@ def main(use_cuda): if __name__ == '__main__': # for use_cuda in (False, True): - main(use_cuda=True) + main(use_cuda=core.is_compiled_with_cuda()) diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index 49f549fa184037a64aa846f0d1d0e1b57db1f2ef..c471863920999a28cbede93a7965f07ee784f96d 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +import paddle.fluid.core as core import math import os import sys @@ -257,6 +258,8 @@ def inject_test_method(use_cuda, parallel, nn_type, combine): def inject_all_tests(): for use_cuda in (False, True): + if use_cuda and not core.is_compiled_with_cuda(): + continue for parallel in (False, True): for nn_type in ('mlp', 'conv'): inject_test_method(use_cuda, parallel, nn_type, True) diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 80e0692bc640efc280c43bd5b929847ad29207c4..3b957508ca1f11fea3bbc182dca7eaa938594cb6 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -245,7 +245,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): is_sparse=is_sparse, is_parallel=is_parallel) - if use_cuda and is_sparse: + if (not fluid.core.is_compiled_with_cuda() or use_cuda) and is_sparse: fn = __impl__ else: # skip the other test when on CI server diff --git a/python/paddle/fluid/tests/test_beam_search_decoder.py b/python/paddle/fluid/tests/test_beam_search_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..7a2502fa2f9733a7280e8e8d884b61b419719492 --- /dev/null +++ b/python/paddle/fluid/tests/test_beam_search_decoder.py @@ -0,0 +1,265 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +A simple machine translation demo using beam search decoder. +""" + +import contextlib +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.framework as framework +import paddle.fluid.layers as layers +from paddle.fluid.executor import Executor +from paddle.fluid.contrib.decoder.beam_search_decoder import * +import unittest +import os + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 32 +word_dim = 32 +decoder_size = hidden_dim +IS_SPARSE = True +batch_size = 2 +max_length = 8 +topk_size = 50 +trg_dic_size = 10000 +beam_size = 2 + + +def encoder(): + # encoder + src_word = layers.data( + name="src_word", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + fc1 = layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_last_step(input=lstm_hidden0) + return encoder_out + + +def decoder_state_cell(context): + h = InitState(init=context, need_reorder=True) + state_cell = StateCell(inputs={'x': None}, states={'h': h}, out_state='h') + + @state_cell.state_updater + def updater(state_cell): + current_word = state_cell.get_input('x') + prev_h = state_cell.get_state('h') + # make sure lod of h heritted from prev_h + h = layers.fc(input=[prev_h, current_word], + size=decoder_size, + act='tanh') + state_cell.set_state('h', h) + + return state_cell + + +def decoder_train(state_cell): + # decoder + trg_language_word = layers.data( + name="target_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE) + + decoder = TrainingDecoder(state_cell) + + with decoder.block(): + current_word = decoder.step_input(trg_embedding) + decoder.state_cell.compute_state(inputs={'x': current_word}) + current_score = layers.fc(input=decoder.state_cell.get_state('h'), + size=target_dict_dim, + act='softmax') + decoder.state_cell.update_states() + decoder.output(current_score) + + return decoder() + + +def decoder_decode(state_cell): + init_ids = layers.data( + name="init_ids", shape=[1], dtype="int64", lod_level=2) + init_scores = layers.data( + name="init_scores", shape=[1], dtype="float32", lod_level=2) + + decoder = BeamSearchDecoder( + state_cell=state_cell, + init_ids=init_ids, + init_scores=init_scores, + target_dict_dim=target_dict_dim, + word_dim=word_dim, + input_var_dict={}, + topk_size=topk_size, + sparse_emb=IS_SPARSE, + max_len=max_length, + beam_size=beam_size, + end_id=1, + name=None) + decoder.decode() + translation_ids, translation_scores = decoder() + + return translation_ids, translation_scores + + +def train_main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder() + state_cell = decoder_state_cell(context) + rnn_out = decoder_train(state_cell) + label = layers.data( + name="target_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-3) + optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + feed_order = ['src_word', 'target_word', 'target_next_word'] + + exe = Executor(place) + + def train_loop(main_program): + exe.run(framework.default_startup_program()) + + feed_list = [ + main_program.global_block().var(var_name) for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + for pass_id in xrange(1): + for batch_id, data in enumerate(train_reader()): + outs = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + break + + train_loop(framework.default_main_program()) + + +def decode_main(use_cuda): + if use_cuda and not fluid.core.is_compiled_with_cuda(): + return + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + + context = encoder() + state_cell = decoder_state_cell(context) + translation_ids, translation_scores = decoder_decode(state_cell) + + exe = Executor(place) + exe.run(framework.default_startup_program()) + + init_ids_data = np.array([0 for _ in range(batch_size)], dtype='int64') + init_scores_data = np.array( + [1. for _ in range(batch_size)], dtype='float32') + init_ids_data = init_ids_data.reshape((batch_size, 1)) + init_scores_data = init_scores_data.reshape((batch_size, 1)) + init_lod = [1] * batch_size + init_lod = [init_lod, init_lod] + + init_ids = fluid.create_lod_tensor(init_ids_data, init_lod, place) + init_scores = fluid.create_lod_tensor(init_scores_data, init_lod, place) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size), buf_size=1000), + batch_size=batch_size) + + feed_order = ['src_word'] + feed_list = [ + framework.default_main_program().global_block().var(var_name) + for var_name in feed_order + ] + feeder = fluid.DataFeeder(feed_list, place) + + data = train_reader().next() + feed_dict = feeder.feed(map(lambda x: [x[0]], data)) + feed_dict['init_ids'] = init_ids + feed_dict['init_scores'] = init_scores + + result_ids, result_scores = exe.run( + framework.default_main_program(), + feed=feed_dict, + fetch_list=[translation_ids, translation_scores], + return_numpy=False) + print result_ids.lod() + + +class TestBeamSearchDecoder(unittest.TestCase): + pass + + +@contextlib.contextmanager +def scope_prog_guard(): + prog = fluid.Program() + startup_prog = fluid.Program() + scope = fluid.core.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(prog, startup_prog): + yield + + +def inject_test_train(use_cuda): + f_name = 'test_{0}_train'.format('cuda' if use_cuda else 'cpu') + + def f(*args): + with scope_prog_guard(): + train_main(use_cuda) + + setattr(TestBeamSearchDecoder, f_name, f) + + +def inject_test_decode(use_cuda, decorator=None): + f_name = 'test_{0}_decode'.format('cuda' if use_cuda else 'cpu', 'sparse') + + def f(*args): + with scope_prog_guard(): + decode_main(use_cuda) + + if decorator is not None: + f = decorator(f) + + setattr(TestBeamSearchDecoder, f_name, f) + + +for _use_cuda_ in (False, True): + inject_test_train(_use_cuda_) + +for _use_cuda_ in (False, True): + _decorator_ = None + inject_test_decode(use_cuda=_use_cuda_, decorator=_decorator_) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index f6c8dcabcbc592024188f4742e6c532a704d2289..8c564abf986d351b31e993b62bfe1f17c52a4e10 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -12,6 +12,11 @@ endif(NOT WITH_MKLDNN) if(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_recv_op) + list(REMOVE_ITEM TEST_OPS test_dist_transpiler) + list(REMOVE_ITEM TEST_OPS test_simple_dist_transpiler) + list(REMOVE_ITEM TEST_OPS test_listen_and_serv_op) + LIST(REMOVE_ITEM TEST_OPS test_dist_mnist) + LIST(REMOVE_ITEM TEST_OPS test_dist_word2vec) endif(NOT WITH_DISTRIBUTE) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 @@ -47,9 +52,11 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL) -py_test_modules(test_dist_train MODULES test_dist_train SERIAL) +if(WITH_DISTRIBUTE) + py_test_modules(test_dist_train MODULES test_dist_train SERIAL) + set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) + set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) + set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) +endif() py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL) py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) -set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) -set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) -set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py index db5771f7b0ad74c73b81d502209c17dce3ce8457..4a3ac2a31e072eb1a15af31f558cf9f626a7ac53 100644 --- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py +++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py @@ -100,6 +100,8 @@ class TestBeamSearchDecodeOp(unittest.TestCase): np.array_equal(np.array(sentence_scores), expected_data)) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestBeamSearchDecodeOpGPU(TestBeamSearchDecodeOp): def setUp(self): self.scope = core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 07545e7feb46c85a4b80f9b846be27d36cbfb59a..af6cd99b0d7e6b0a2dfd4fc1d33e8390017a5906 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -191,12 +191,16 @@ class TestWithDilation(TestConv2dTransposeOp): # ------------ test_cudnn ------------ +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNN(TestConv2dTransposeOp): def init_op_type(self): self.use_cudnn = True self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1] @@ -212,6 +216,8 @@ class TestCUDNNWithPad(TestWithPad): self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1] @@ -227,6 +233,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv2d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithGroups(TestWithGroups): def init_test_case(self): self.pad = [1, 1] diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index c9f26d10df8ff39d6bd77b1597336600f676d362..300fa5e8bde001e0f66c5f924a81c30add99aead 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -197,12 +197,16 @@ class TestWithDilation(TestConv3dTransposeOp): # ------------ test_cudnn ------------ +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNN(TestConv3dTransposeOp): def init_op_type(self): self.use_cudnn = True self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithPad(TestWithPad): def init_test_case(self): self.pad = [1, 1, 1] @@ -218,6 +222,8 @@ class TestCUDNNWithPad(TestWithPad): self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithStride(TestWithStride): def init_test_case(self): self.pad = [1, 1, 1] @@ -233,6 +239,8 @@ class TestCUDNNWithStride(TestWithStride): self.op_type = "conv3d_transpose" +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestCUDNNWithGroups(TestWithGroups): def init_test_case(self): self.pad = [1, 1, 1] diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py index 1f5d2f16773efb7537de85abec88344f8e0daa9f..60d63364d5f403f04519363db5ad3ad136f8a975 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py @@ -15,6 +15,7 @@ import paddle.dataset.flowers as flowers import math import paddle.fluid as fluid +import paddle.fluid.core as core import unittest import numpy as np import paddle @@ -92,7 +93,8 @@ class TestFetchOp(unittest.TestCase): train_inputs.append(tst_reader_iter.next()) os.environ['CPU_NUM'] = str(4) - self.parallel_exe(train_inputs, seed=1, use_cuda=True) + if core.is_compiled_with_cuda(): + self.parallel_exe(train_inputs, seed=1, use_cuda=True) self.parallel_exe(train_inputs, seed=1, use_cuda=False) @@ -137,7 +139,8 @@ class TestFeedParallel(unittest.TestCase): def test_feed_op(self): os.environ['CPU_NUM'] = str(4) - self.parallel_exe(use_cuda=True, seed=1) + if core.is_compiled_with_cuda(): + self.parallel_exe(use_cuda=True, seed=1) self.parallel_exe(use_cuda=False, seed=1) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index ccd1b1c122d53fc1d8a2c4d7869e4a2164260906..848a3f8b87d5a8aaa4a548c7dbe51f7a1f862fc5 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -14,6 +14,7 @@ from parallel_executor_test_base import TestParallelExecutorBase import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np import paddle import paddle.dataset.mnist as mnist @@ -98,6 +99,8 @@ class TestMNIST(TestParallelExecutorBase): MNIST_RECORDIO_FILE, reader, feeder) def check_simple_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return self.check_network_convergence(simple_fc_net, use_cuda=use_cuda) self.check_network_convergence( simple_fc_net, use_cuda=use_cuda, allow_op_delay=True) @@ -122,6 +125,8 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_convergence(False, True) def check_simple_fc_parallel_accuracy(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') single_first_loss, single_last_loss = self.check_network_convergence( @@ -155,6 +160,8 @@ class TestMNIST(TestParallelExecutorBase): self.check_simple_fc_parallel_accuracy(False, True) def check_batchnorm_fc_convergence(self, use_cuda, use_reduce=False): + if use_cuda and not core.is_compiled_with_cuda(): + return self.check_network_convergence(fc_with_batchnorm, use_cuda=use_cuda) img = np.zeros(shape=[32, 784], dtype='float32') label = np.ones(shape=[32, 1], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 4d39505b66abf44249e0ea160b82aaf7be0638cb..834e920845f29b153909a971eb5afc4f8a33346e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -16,6 +16,7 @@ import paddle.fluid as fluid import paddle.fluid.layers.ops as ops from paddle.fluid.initializer import init_on_cpu from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter +import paddle.fluid.core as core from parallel_executor_test_base import TestParallelExecutorBase import unittest import math @@ -140,6 +141,9 @@ class TestResnet(TestParallelExecutorBase): use_reduce=False, iter=20): + if use_cuda and not core.is_compiled_with_cuda(): + return + os.environ['CPU_NUM'] = str(4) def _cosine_decay(learning_rate, step_each_epoch, epochs=120): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 9a2733927d38f1a2b1af92fcc12f036158b4d06f..7688b8495d7f7c60e80f62dae2edd72be9f839d4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -13,6 +13,7 @@ # limitations under the License. import paddle.fluid as fluid +import paddle.fluid.core as core import numpy as np import unittest import os @@ -92,16 +93,18 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase): def test_parallel_testing(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce - self.check_network_convergence( - use_cuda=True, build_strategy=build_strategy) + if core.is_compiled_with_cuda(): + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) self.check_network_convergence( use_cuda=False, build_strategy=build_strategy) def test_parallel_testing_with_new_strategy(self): build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce - self.check_network_convergence( - use_cuda=True, build_strategy=build_strategy) + if core.is_compiled_with_cuda(): + self.check_network_convergence( + use_cuda=True, build_strategy=build_strategy) self.check_network_convergence( use_cuda=False, build_strategy=build_strategy) diff --git a/python/paddle/fluid/tests/unittests/test_print_op.py b/python/paddle/fluid/tests/unittests/test_print_op.py index e01af42a58b86042fd0282928d1a78d9c3239fe3..b461c5c9401d74ef8dcf4afc84dc0ea6920a2419 100644 --- a/python/paddle/fluid/tests/unittests/test_print_op.py +++ b/python/paddle/fluid/tests/unittests/test_print_op.py @@ -56,6 +56,8 @@ class TestPrintOpCPU(unittest.TestCase): return_numpy=False) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestPrintOpGPU(TestPrintOpCPU): def setUp(self): self.place = core.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index cf6fe14a86aa1ab6ea3f60ad15f33d708e9b803a..9f8d33f9bbfc78b6f1a0c089b34b2f41d561c640 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -79,12 +79,18 @@ class TestProfiler(unittest.TestCase): pass_acc_calculator.add(value=acc, weight=b_size) pass_acc = pass_acc_calculator.eval() + @unittest.skipIf(not core.is_compiled_with_cuda(), + "profiler is enabled only with GPU") def test_cpu_profiler(self): self.net_profiler('CPU') + @unittest.skipIf(not core.is_compiled_with_cuda(), + "profiler is enabled only with GPU") def test_cuda_profiler(self): self.net_profiler('GPU') + @unittest.skipIf(not core.is_compiled_with_cuda(), + "profiler is enabled only with GPU") def test_all_profiler(self): self.net_profiler('All', '/tmp/profile_out') with open('/tmp/profile_out', 'r') as f: diff --git a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py index e91a69a0f8039651225039beb2a42e8dffeb62d3..c4fc8b74cf80c3596b0af9f7f0434864591195bd 100644 --- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py @@ -61,6 +61,8 @@ class TestSequenceSoftmaxOp(OpTest): # ----------------cudnn Sequencesoftmax---------------- +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSequenceSoftmaxCUDNNOp(TestSequenceSoftmaxOp): def init_op_type(self): self.use_cudnn = True diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py index 279f3073f73d1c36f54bb901d92441a7403ac23f..0ab581cfb0ea0ff2205450b8e62edb8bf3c51707 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py @@ -63,11 +63,15 @@ class TestSoftmaxOp(OpTest): self.check_grad(["X"], "Out", max_relative_error=0.01) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxCUDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_cudnn = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxFP16Op(TestSoftmaxOp): def init_kernel_type(self): self.dtype = np.float16 @@ -79,6 +83,8 @@ class TestSoftmaxFP16Op(TestSoftmaxOp): self.check_output_with_place(place, atol=1e-3) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestSoftmaxFP16CUDNNOp(TestSoftmaxOp): def init_kernel_type(self): self.use_cudnn = True diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 2b959c48e4bc62e08f6f57981b61b7c5fe3a1d06..026cf501cfb35ab3fe35d24f52d3c271565482ef 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -68,8 +68,14 @@ def reader_creator(image_filename, label_filename, buffer_size): for i in xrange(buffer_size): yield images[i, :], int(labels[i]) finally: - m.terminate() - l.terminate() + try: + m.terminate() + except: + pass + try: + l.terminate() + except: + pass return reader diff --git a/python/setup.py.in b/python/setup.py.in index a81cd19e10153be0d07badfa0c0fbcb01fe568f7..4a6cddbbea4903f5a65123aa19b7e978b335f32b 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -104,6 +104,8 @@ packages=['paddle', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', + 'paddle.fluid.contrib', + 'paddle.fluid.contrib.decoder', 'paddle.fluid.transpiler', 'paddle.fluid.transpiler.details']