diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index bda2069d14307e4d2f9a0e369ef8b9d16911fdb6..88a2c740e08f7c2c6ac831c65a0ef992064b3a61 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -174,6 +174,7 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None)) diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 1a2a9ac328c4a9b89bfb89106af81b9fb3ed3028..4305eb65733a7c871450949ce2c48cab013bac81 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -37,8 +37,9 @@ struct TestBroadcastOpHandle { std::vector local_scopes_; std::vector param_scopes_; Scope g_scope_; - std::unique_ptr op_handle_; - std::vector> vars_; + OpHandleBase* op_handle_; + std::vector vars_; + std::vector> nodes_; std::vector place_list_; bool use_gpu_; #ifdef PADDLE_WITH_CUDA @@ -90,6 +91,7 @@ struct TestBroadcastOpHandle { } void InitBroadcastOp(size_t input_scope_idx) { + nodes_.clear(); for (size_t j = 0; j < place_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); Scope& local_scope = local_scopes_.back()->NewScope(); @@ -101,39 +103,39 @@ struct TestBroadcastOpHandle { } param_scopes_[input_scope_idx]->Var("input"); - std::unique_ptr n = - ir::CreateNodeForTest("node0", ir::Node::Type::kOperation); + nodes_.emplace_back( + ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, - place_list_, nccl_ctxs_.get())); + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset(new BroadcastOpHandle(n.get(), local_scopes_, - place_list_, nccl_ctxs_.get())); + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_, nccl_ctxs_.get()); #else - op_handle_.reset( - new BroadcastOpHandle(n.get(), local_scopes_, place_list_)); + op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, + place_list_); #endif } - std::unique_ptr v = - ir::CreateNodeForTest("node1", ir::Node::Type::kVariable); - auto* in_var_handle = new VarHandle(v.get(), 1, input_scope_idx, "input", - place_list_[input_scope_idx]); + nodes_.emplace_back( + ir::CreateNodeForTest("node1", ir::Node::Type::kVariable)); + auto* in_var_handle = new VarHandle(nodes_.back().get(), 1, input_scope_idx, + "input", place_list_[input_scope_idx]); 
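// NOTE: after this change the test keeps raw pointers only. Each VarHandle /
// OpHandleBase is owned by the ir::Node that wraps it (Node::WrappedBy installs
// a deleter on the node), and nodes_ keeps those nodes alive for the test.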
vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add dummy var - std::unique_ptr v2 = - ir::CreateNodeForTest("node2", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v2.get())); + nodes_.emplace_back( + ir::CreateNodeForTest("node2", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); DummyVarHandle* dummy_var_handle = - static_cast(vars_.back().get()); + static_cast(vars_.back()); dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(dummy_var_handle); @@ -141,20 +143,20 @@ struct TestBroadcastOpHandle { if (!use_gpu_) { op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get()); } - std::unique_ptr v3 = - ir::CreateNodeForTest("node3", ir::Node::Type::kVariable); + nodes_.emplace_back( + ir::CreateNodeForTest("node3", ir::Node::Type::kVariable)); VarHandle* out_var_handle = - new VarHandle(v3.get(), 2, j, "out", place_list_[j]); + new VarHandle(nodes_.back().get(), 2, j, "out", place_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } // add dummy var - std::unique_ptr v4 = - ir::CreateNodeForTest("node4", ir::Node::Type::kVariable); - vars_.emplace_back(new DummyVarHandle(v4.get())); + nodes_.emplace_back( + ir::CreateNodeForTest("node4", ir::Node::Type::kVariable)); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); DummyVarHandle* out_dummy_var_handle = - static_cast(vars_.back().get()); + static_cast(vars_.back()); out_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddOutput(out_dummy_var_handle); } diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 2b2329b9698908fdbe3385f1d555d756c47fc5c0..ca11c0083961f9b3d04e33113a0d685d508918f9 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -16,6 +16,7 @@ #include #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { @@ -32,13 +33,11 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( pool_(strategy.num_threads_ + 1), // add one more thread for generate op_deps fetch_ctxs_(places) { - auto &ops = graph_->Get("ops"); - - for (auto &op : ops) { + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); - op_deps_.emplace(op.get(), dep); + op_deps_.emplace(op, dep); if (dep == 0) { - bootstrap_ops_.emplace_back(op.get()); + bootstrap_ops_.emplace_back(op); } } @@ -54,13 +53,13 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( paddle::framework::FeedFetchList fetches; fetches.resize(fetch_tensors.size()); std::unordered_map> fetched_vars; - std::vector> fetch_ops; + std::vector fetch_ops; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : graph_->Get("vars")) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); + fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); } } } @@ -110,7 +109,10 @@ FeedFetchList FastThreadedSSAGraphExecutor::Run( complete_q->Pop(); } } - exception_.ReThrow(); + if (exception_.IsCaught()) { + ClearFetchOp(graph_.get(), &fetch_ops); + exception_.ReThrow(); + } } num_complete += num_comp; } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc 
b/paddle/fluid/framework/details/fetch_op_handle.cc index fe18b2060c5cd7e157374da53c5a985f70545ab7..648adae06facb504042d8286f6eab5d98e99c015 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -28,11 +28,7 @@ FetchOpHandle::FetchOpHandle(ir::Node *node, FeedFetchList *data, size_t offset, offset_(offset), local_scopes_(local_scopes) {} -FetchOpHandle::~FetchOpHandle() { - for (auto *input_var : inputs_) { - input_var->RemoveOutput(this, this->Node()); - } -} +FetchOpHandle::~FetchOpHandle() {} void FetchOpHandle::RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx) { PADDLE_THROW("Nobody should wait FetchOp. Unexpected Error"); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 0f12bd2b4e857648342aeb5ad33b6c0fe01c9c73..541993c74332cc483a8b854a6b8f227c7c9a19a9 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -22,8 +22,10 @@ namespace details { struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { std::vector<std::string> out_varnames_; + std::vector<std::unique_ptr<ir::Node>> nodes_; void InitFusedBroadcastOp(std::vector<size_t> input_scope_idxes) { + nodes_.clear(); // initialize scope and var for (size_t i = 0; i < place_list_.size(); ++i) { local_scopes_.push_back(&(g_scope_.NewScope())); @@ -39,41 +41,41 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { } // create op handle node - std::unique_ptr<ir::Node> n = - ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation); + nodes_.emplace_back( + ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { #ifdef PADDLE_WITH_CUDA - op_handle_.reset(new FusedBroadcastOpHandle( - n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); + op_handle_ = new FusedBroadcastOpHandle( + nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { #ifdef PADDLE_WITH_CUDA - op_handle_.reset(new FusedBroadcastOpHandle( - n.get(), local_scopes_, place_list_, nccl_ctxs_.get())); + op_handle_ = new FusedBroadcastOpHandle( + nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else - op_handle_.reset( - new FusedBroadcastOpHandle(n.get(), local_scopes_, place_list_)); + op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(), + local_scopes_, place_list_); #endif } for (size_t i = 0; i < input_scope_idxes.size(); ++i) { // add input var handle - std::unique_ptr<ir::Node> in_node = - ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable); + nodes_.emplace_back( + ir::CreateNodeForTest("in_node" + i, ir::Node::Type::kVariable)); VarHandle* in_var_handle = - new VarHandle(in_node.get(), 1, input_scope_idxes[i], "in_var" + i, - place_list_[input_scope_idxes[i]]); + new VarHandle(nodes_.back().get(), 1, input_scope_idxes[i], + "in_var" + i, place_list_[input_scope_idxes[i]]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); // add output var handle for (size_t j = 0; j < place_list_.size(); ++j) { - std::unique_ptr<ir::Node> out_node = - ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable); - VarHandle* out_var_handle = - new VarHandle(out_node.get(), 2, j, "out_var" + i, place_list_[j]); + nodes_.emplace_back( + ir::CreateNodeForTest("out_node" + i, ir::Node::Type::kVariable)); + VarHandle* out_var_handle = new VarHandle( + nodes_.back().get(), 2, j, "out_var" + i,
place_list_[j]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); } diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index ed67e88ff6a7fe9efd93e5dfd4d7bdf4c43aac2e..e8cb7feb8bea92a7486b8a9d84ba4b9e2b93dbfb 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -31,9 +31,10 @@ struct TestGatherOpHandle { std::vector local_scopes_; std::vector param_scopes_; Scope g_scope_; - std::unique_ptr op_handle_; - std::vector> vars_; + OpHandleBase* op_handle_; + std::vector vars_; std::vector gpu_list_; + std::vector> nodes_; void WaitAll() { for (size_t j = 0; j < ctxs_.size(); ++j) { @@ -70,7 +71,7 @@ struct TestGatherOpHandle { } void InitGatherOp(size_t input_scope_idx) { - std::vector> nodes; + nodes_.clear(); for (size_t j = 0; j < gpu_list_.size(); ++j) { local_scopes_.push_back(&(g_scope_.NewScope())); Scope& local_scope = local_scopes_.back()->NewScope(); @@ -82,44 +83,45 @@ struct TestGatherOpHandle { } param_scopes_[input_scope_idx]->Var("out"); - nodes.emplace_back( + nodes_.emplace_back( ir::CreateNodeForTest("node", ir::Node::Type::kOperation).release()); - op_handle_.reset( - new GatherOpHandle(nodes.back().get(), local_scopes_, gpu_list_)); + op_handle_ = + new GatherOpHandle(nodes_.back().get(), local_scopes_, gpu_list_); // add input for (size_t j = 0; j < gpu_list_.size(); ++j) { op_handle_->SetDeviceContext(gpu_list_[j], ctxs_[j].get()); - nodes.emplace_back( + nodes_.emplace_back( ir::CreateNodeForTest("node1", ir::Node::Type::kVariable).release()); auto* in_var_handle = - new VarHandle(nodes.back().get(), 1, j, "input", gpu_list_[j]); + new VarHandle(nodes_.back().get(), 1, j, "input", gpu_list_[j]); vars_.emplace_back(in_var_handle); op_handle_->AddInput(in_var_handle); } // add dummy var - nodes.emplace_back( + nodes_.emplace_back( ir::CreateNodeForTest("node2", ir::Node::Type::kVariable).release()); - vars_.emplace_back(new DummyVarHandle(nodes.back().get())); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); DummyVarHandle* in_dummy_var_handle = - static_cast(vars_.back().get()); + static_cast(vars_.back()); in_dummy_var_handle->ClearGeneratedOp(); op_handle_->AddInput(in_dummy_var_handle); // add output - nodes.emplace_back( + nodes_.emplace_back( ir::CreateNodeForTest("node3", ir::Node::Type::kVariable).release()); - auto* out_var_handle = new VarHandle(nodes.back().get(), 2, input_scope_idx, - "out", gpu_list_[input_scope_idx]); + auto* out_var_handle = + new VarHandle(nodes_.back().get(), 2, input_scope_idx, "out", + gpu_list_[input_scope_idx]); vars_.emplace_back(out_var_handle); op_handle_->AddOutput(out_var_handle); // add dummy var - nodes.emplace_back( + nodes_.emplace_back( ir::CreateNodeForTest("node4", ir::Node::Type::kVariable).release()); - vars_.emplace_back(new DummyVarHandle(nodes.back().get())); + vars_.emplace_back(new DummyVarHandle(nodes_.back().get())); DummyVarHandle* dummy_var_handle = - static_cast(vars_.back().get()); + static_cast(vars_.back()); op_handle_->AddOutput(dummy_var_handle); } diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc index a3ecd589aa87b59ca9d423ea26b476b4816b71b7..bf3f3637b551a8a8084e6e4f1ca6a94b65361f17 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc +++ 
b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc @@ -16,6 +16,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_graph_view.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { @@ -35,10 +36,10 @@ static bool IsLockAndRecordEventFreeComputationOpHandle( std::unique_ptr ModifyOpLockAndRecordEventPass::ApplyImpl( std::unique_ptr ir_graph) const { - auto &all_ops = ir_graph->Get(kGraphOps); + auto all_ops = ir::FilterByNodeWrapper(*ir_graph); OpGraphView graph_view(all_ops); for (auto &op : all_ops) { - auto *compute_op = dynamic_cast(op.get()); + auto *compute_op = dynamic_cast(op); if (compute_op == nullptr) continue; bool is_lock_and_record_event_free = IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view); diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c9c255864a2477ed29873f8521acce37fa928c06..c8ea18804630fea4ada98062256730dbf4c24860 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { @@ -36,20 +37,20 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair.get()); + insert_pending_var(version_pair); } } } for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var.get()); + insert_pending_var(var); } - for (auto &op : graph->Get(kGraphOps)) { + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { if (op->Inputs().empty()) { - ready_ops.insert(op.get()); + ready_ops.insert(op); } else { - pending_ops.insert({op.get(), op.get()->NoDupInputSize()}); + pending_ops.insert({op, op->NoDupInputSize()}); } } @@ -89,6 +90,4 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { REGISTER_PASS(multi_devices_check_pass, paddle::framework::details::SSAGraghBuilderWithChecker) .RequireGraphAttr(paddle::framework::details::kGraphVars) - .RequireGraphAttr(paddle::framework::details::kGraphDepVars) - .RequireGraphAttr(paddle::framework::details::kGraphOps) - .RequireGraphAttr(paddle::framework::details::kShardedVarDevice); + .RequireGraphAttr(paddle::framework::details::kGraphDepVars); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 2ead651c6eb63f3485eac983ea64af387df36ab6..8b2acc7caffec076a5d63fc3630430a61eaab493 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -34,7 +34,14 @@ namespace paddle { namespace framework { namespace details { + namespace { +// TODO(panyx0718): Clean this up as well. +// all operators. NOTE that even we use a vector here, the operators is +// unordered. 
+typedef std::vector GraphOps; +const char kGraphOps[] = "ops"; + void PolishGraphToSupportDataHazards(ir::Graph *graph) { for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { @@ -92,7 +99,7 @@ VarHandle *CreateOrGetLatestVarHandle(ir::Graph *graph, ir::Node *node, } var_holder.emplace_back(var); } else { - var = var_holder.rbegin()->get(); + var = *var_holder.rbegin(); } return var; } @@ -154,7 +161,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t place_id) const { auto p = places_[place_id]; - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -303,7 +310,6 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( result.Set(kGraphVars, new GraphVars(places_.size())); result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - result.Set(kShardedVarDevice, new ShardedVarDevice); // find send/recv vars so that we can place the distributed training // related op in the place 0 @@ -317,11 +323,13 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( bool is_forwarding = true; bool is_dist_train = false; + std::unordered_map sharded_var_device; + for (ir::Node *node : sorted_ops) { if (boost::get( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node); + int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); PADDLE_ENFORCE(op_dev_id != -1, "Can not schedule the RPC operator to the right place."); if (node->Op()->Type() == "recv") { @@ -337,7 +345,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } else if (boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) == static_cast(OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node); + int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); if (node->Op()->Type() == "concat") { auto origin_param_name = node->Op()->OutputArgumentNames()[0]; bcast_var_name_set[op_dev_id].emplace(origin_param_name); @@ -356,12 +364,11 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(result, node); + int op_dev_id = GetOpDeviceID(result, node, sharded_var_device); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - graph->Get(kShardedVarDevice) - .emplace(n->Name(), op_dev_id); + sharded_var_device.emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's @@ -398,8 +405,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - graph->Get(kShardedVarDevice) - .emplace(g_name, cur_device_id); + sharded_var_device.emplace(g_name, cur_device_id); if (!is_dist_train) { bcast_var_name_set[cur_device_id].emplace(p_name); } @@ -458,7 +464,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( * Only variables should be the leaves of graph. 
*/ AddOutputToLeafOps(&result); - PADDLE_ENFORCE(!ir::HasCircle(result)); + result.Erase(kGraphOps); return graph; } @@ -498,7 +504,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, result->Get(kGraphOps).emplace_back(op_handle); auto *in = - result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(src_dev_id).at(p_name).back(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { @@ -535,7 +541,7 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( for (size_t dev_id = 0; dev_id < bcast_varnames.size(); ++dev_id) { for (auto &p_name : bcast_varnames[dev_id]) { auto *in = - result->Get(kGraphVars).at(dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(dev_id).at(p_name).back(); op_handle->AddInput(in); for (size_t out_dev_id = 0; out_dev_id < places_.size(); ++out_dev_id) { auto &p = places_[out_dev_id]; @@ -571,7 +577,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; @@ -579,7 +585,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); - op_handle->AddInput(prev_grad.get()); + op_handle->AddInput(prev_grad); auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), @@ -600,14 +606,14 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { auto &vars = result->Get(kGraphVars)[i][d_name]; PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back().get()); + op_handle->AddInput(vars.back()); auto var = new VarHandle( result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), vars.size(), i, d_name, p); @@ -617,8 +623,9 @@ void MultiDevSSAGraphBuilder::InsertDataBalanceOp( } } -int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID( + const ir::Graph &graph, ir::Node *node, + const std::unordered_map &sharded_var_device) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } @@ -631,15 +638,15 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(graph, param_grad[1]); + int dev_id = GetVarDeviceID(graph, param_grad[1], sharded_var_device); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } -int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, - const std::string &varname) const { - auto &sharded_var_device = graph.Get(kShardedVarDevice); +int MultiDevSSAGraphBuilder::GetVarDeviceID( + const ir::Graph &graph, const std::string &varname, + const std::unordered_map &sharded_var_device) const { auto got = sharded_var_device.find(varname); return got 
== sharded_var_device.end() ? -1 : got->second; } @@ -690,7 +697,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; @@ -698,7 +705,7 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); - op_handle->AddInput(prev_grad.get()); + op_handle->AddInput(prev_grad); } auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; auto var = @@ -709,8 +716,9 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateDistTrainOp( + ir::Graph *result, ir::Node *node, + std::unordered_map *sharded_var_device) const { int op_dev_id = -1; std::vector input_var_names; std::vector output_var_names; @@ -725,23 +733,22 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, node->Op()->Type() == "split_selected_rows" || node->Op()->Type() == "split_ids") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(*result, input_var_names[0]); + op_dev_id = + GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - result->Get(kShardedVarDevice) - .emplace(varname, op_dev_id); + sharded_var_device->emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - result->Get(kShardedVarDevice) - .emplace(varname, op_dev_id); + sharded_var_device->emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(*result, input_var_names[0]); + op_dev_id = + GetVarDeviceID(*result, input_var_names[0], *sharded_var_device); for (auto &varname : output_var_names) { - result->Get(kShardedVarDevice) - .emplace(varname, op_dev_id); + sharded_var_device->emplace(varname, op_dev_id); } } else { LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); @@ -759,14 +766,14 @@ int MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); for (ir::Node *input : node->inputs) { VarHandle *var = nullptr; for (int place_offset = 0; place_offset < num_places; ++place_offset) { auto &var_holders = result->Get(kGraphVars)[place_offset]; auto &var_holder = var_holders[input->Name()]; if (!var_holder.empty()) { - var = var_holder.rbegin()->get(); + var = *var_holder.rbegin(); op_handle->AddInput(var); } } @@ -774,12 +781,14 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, - ir::Node *node) const { +int MultiDevSSAGraphBuilder::CreateRPCOp( + ir::Graph *result, ir::Node *node, + std::unordered_map *sharded_var_device) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); + op_dev_id = + GetVarDeviceID(*result, node->inputs[0]->Name(), *sharded_var_device); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -797,11 +806,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, VLOG(100) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - result->Get(kShardedVarDevice) - .emplace(varname, op_dev_id); + sharded_var_device->emplace(varname, op_dev_id); } - result->Get(kShardedVarDevice) - .emplace(send_param_grad[1], op_dev_id); + sharded_var_device->emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -811,7 +818,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]); + op_dev_id = + GetVarDeviceID(*result, recv_param_grad[1], *sharded_var_device); VLOG(100) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -819,8 +827,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - result->Get(kShardedVarDevice) - .emplace(varname, op_dev_id); + sharded_var_device->emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -839,7 +846,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, // send_barrier, recv, fetch_barrier's inputs are deps var, get them from // all places auto p = places_[op_dev_id]; - auto *op_handle = result->Get(kGraphOps).back().get(); + auto *op_handle = result->Get(kGraphOps).back(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -847,7 +854,8 @@ int MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = GetVarDeviceID(*result, output->Name()); + outvar_dev_id = + GetVarDeviceID(*result, output->Name(), *sharded_var_device); PADDLE_ENFORCE_NE(outvar_dev_id, -1); } p = places_[outvar_dev_id]; diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 03b2de2f04da4bac8d342a76c80fd12beaeba4b7..f3ec2d29415240b7012f458070223469d0947166 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -44,12 +44,18 @@ class MultiDevSSAGraphBuilder : public ir::Pass { mutable platform::NCCLContextMap *nccl_ctxs_; #endif - int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const; + int GetVarDeviceID( + const ir::Graph &graph, const std::string &varname, + const std::unordered_map &sharded_var_device) const; bool IsScaleLossOp(ir::Node *node) const; - int CreateRPCOp(ir::Graph *result, ir::Node *node) const; - int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + int CreateRPCOp( + ir::Graph *result, ir::Node *node, + std::unordered_map *sharded_var_device) const; + int CreateDistTrainOp( + ir::Graph *result, ir::Node *node, + std::unordered_map 
*sharded_var_device) const; std::vector<std::string> FindDistTrainSendVars( const std::vector<ir::Node *> &nodes) const; @@ -69,7 +75,9 @@ class MultiDevSSAGraphBuilder : public ir::Pass { void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; + int GetOpDeviceID( + const ir::Graph &graph, ir::Node *node, + const std::unordered_map<std::string, int> &sharded_var_device) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 361c91dc78c08a2cbf84ee88211d389c1e2312e5..8f92f0948d7d397ab0f20c01eae9e313f739adec 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include <string> #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { @@ -62,7 +63,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, }); size_t op_id = 0; - for (auto &op : graph.Get<GraphOps>(kGraphOps)) { + for (auto &op : ir::FilterByNodeWrapper<OpHandleBase>(graph)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 175c5a9950be69d7bf6ae9e386af762007a18a51..1a2b75fbc0c28984ce5cf00e0a2ce0f804349bb1 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -35,23 +35,14 @@ namespace details { // The outside vector is the device vector. Each element of this vector is a // map from variable name to variables. The variables, who have the same name, // will have a different version. The offset in the -// `std::vector<std::unique_ptr<VarHandle>>` is the version of variables. -typedef std::vector< - std::unordered_map<std::string, std::vector<std::unique_ptr<VarHandle>>>> +// `std::vector<VarHandle *>` is the version of variables. +typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>> GraphVars; const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. -typedef std::unordered_set<std::unique_ptr<VarHandleBase>> GraphDepVars; +typedef std::unordered_set<VarHandleBase *> GraphDepVars; const char kGraphDepVars[] = "dep_vars"; - -// all operators. NOTE that even we use a vector here, the operators is -// unordered.
-typedef std::vector> GraphOps; -const char kGraphOps[] = "ops"; - -typedef std::unordered_map ShardedVarDevice; -const char kShardedVarDevice[] = "sharded_var_device"; } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 65dafd376f7c687410270e35f105ff595fe78f59..4838c4198ff35ba3fb562f3a7c0563ee60179e3b 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -20,19 +20,16 @@ namespace paddle { namespace framework { namespace details { -OpGraphView::OpGraphView( - const std::vector> &ops) { - Build(ops); -} +OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } -void OpGraphView::Build(const std::vector> &ops) { +void OpGraphView::Build(const std::vector &ops) { for (auto &op : ops) { - preceding_ops_[op.get()]; - pending_ops_[op.get()]; + preceding_ops_[op]; + pending_ops_[op]; for (auto &var : op->Outputs()) { for (auto &pending_op : var->PendingOps()) { - preceding_ops_[pending_op].insert(op.get()); - pending_ops_[op.get()].insert(pending_op); + preceding_ops_[pending_op].insert(op); + pending_ops_[op].insert(pending_op); } } } @@ -41,8 +38,6 @@ void OpGraphView::Build(const std::vector> &ops) { "There are duplicate ops in graph."); } -size_t OpGraphView::OpNumber() const { return preceding_ops_.size(); } - std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; for (auto &pair : preceding_ops_) { @@ -60,12 +55,6 @@ void OpGraphView::EnforceHasOp(OpHandleBase *op) const { op == nullptr ? "nullptr" : op->DebugString()); } -const std::unordered_set &OpGraphView::PrecedingOps( - OpHandleBase *op) const { - EnforceHasOp(op); - return preceding_ops_.at(op); -} - const std::unordered_set &OpGraphView::PendingOps( OpHandleBase *op) const { EnforceHasOp(op); diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index 398c019be00a6ff5f5b39fdcbe97339341b1685b..afb3e8e59461eeba10d7027fc70b89cc170c1805 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -26,21 +26,16 @@ namespace details { class OpGraphView { public: - explicit OpGraphView(const std::vector> &ops); - - size_t OpNumber() const; + explicit OpGraphView(const std::vector &ops); std::unordered_set AllOps() const; - const std::unordered_set &PrecedingOps( - OpHandleBase *op) const; - const std::unordered_set &PendingOps(OpHandleBase *op) const; bool HasOp(OpHandleBase *op) const; private: - void Build(const std::vector> &ops); + void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; std::unordered_map> diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index d09b94a3fd32952985a37cf4246c7640d2db4f56..ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -31,7 +31,10 @@ constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; // It's responsible for populating necessary fields of ir::Node. class OpHandleBase { public: - explicit OpHandleBase(ir::Node *node) : node_(node) {} + // Owned by `node`. No need to be deleted explicitly. 
+ explicit OpHandleBase(ir::Node *node) : node_(node) { + node_->WrappedBy(this); + } virtual ~OpHandleBase(); diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 3a9a58412391b188c5e804b41fa47b3607a36bd1..72299c0bfa916d3b92e1c5020ddd69dadad3701d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -30,8 +30,8 @@ struct TestReduceOpHandle { Scope g_scope_; std::vector local_scopes_; std::vector param_scopes_; - std::unique_ptr op_handle_; - std::vector> vars_; + OpHandleBase *op_handle_; + std::vector vars_; std::vector gpu_list_; std::vector> ctxs_; diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 955f075edac3273c262ad53b660a7b9065e31ec7..28443cc886e4c3f5db707d6d8fe9971618d8c2f7 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { @@ -71,14 +72,13 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( // Step 2: Find all variables in non-computation ops which refers to variables // in computation ops std::unordered_set names; - std::unordered_map> + std::unordered_map compute_ref_cnt_map; auto get_ref_cnts_from_compute_op = [&]( - const std::unique_ptr &op, - const std::vector &vars) { + OpHandleBase *op, const std::vector &vars) { std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op.get()); + auto *compute_op = dynamic_cast(op); if (compute_op == nullptr || !platform::is_gpu_place(compute_op->GetPlace())) return var_names_in_op; @@ -121,9 +121,8 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( }; auto update_ref_cnts_from_non_compute_op = [&]( - const std::unique_ptr &op, - const std::vector &vars) { - if (dynamic_cast(op.get()) != nullptr) return; + OpHandleBase *op, const std::vector &vars) { + if (dynamic_cast(op) != nullptr) return; for (VarHandleBase *var_handle_base : vars) { auto *var_handle = dynamic_cast(var_handle_base); if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; @@ -151,21 +150,21 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, gcs[place.device].get(), cur_ref_cnts[place.device].get()); AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op].reset(ref_cnt_handle); + compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; } } } } }; - auto &all_ops = graph->Get(kGraphOps); + auto all_ops = ir::FilterByNodeWrapper(*graph); for (auto &op : all_ops) { auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); if (in_var_names.empty() && out_var_names.empty()) continue; in_var_names.insert(in_var_names.end(), out_var_names.begin(), out_var_names.end()); - auto *compute_op = dynamic_cast(op.get()); + auto *compute_op = dynamic_cast(op); auto place = boost::get(compute_op->GetPlace()); ir::Node *ref_cnt_node = graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); @@ -173,7 +172,7 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( 
ref_cnt_node, compute_op->GetScope(), place, in_var_names, gcs[place.device].get(), cur_ref_cnts[place.device].get()); AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op].reset(ref_cnt_handle); + compute_ref_cnt_map[compute_op] = ref_cnt_handle; } for (auto &op : all_ops) { @@ -181,11 +180,11 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( update_ref_cnts_from_non_compute_op(op, op->Outputs()); } - std::vector> new_all_ops; + std::vector new_all_ops; new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); for (auto &op : all_ops) { new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back().get()); + auto it = compute_ref_cnt_map.find(new_all_ops.back()); if (it != compute_ref_cnt_map.end()) { // Add LeafNode to ReferenceCountOpHandle auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc index 780da5478ff34ecd7096d0ef62b72bf1088dd221..af2cbd5c876fdd7c27cd679f7e9412d1b0604ecc 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -19,14 +19,16 @@ namespace framework { namespace details { SSAGraphExecutor::~SSAGraphExecutor() {} -void ClearFetchOp(ir::Graph* graph, - std::vector>* fetch_ops) { +void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops) { if (fetch_ops->empty()) return; for (auto& op : *fetch_ops) { for (auto& out_var : op->Node()->outputs) { graph->RemoveNode(out_var); } + for (auto& in_var : op->Inputs()) { + in_var->RemoveOutput(op, op->Node()); + } graph->RemoveNode(op->Node()); } fetch_ops->clear(); diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index d5cf7737d565c523995e6685b73c57e5a6f0197b..860eaa25b58e4579ad792ff18618de3b90707e8d 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -38,8 +38,7 @@ class SSAGraphExecutor { virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; -void ClearFetchOp(ir::Graph* graph, - std::vector>* fetch_ops); +void ClearFetchOp(ir::Graph* graph, std::vector* fetch_ops); } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index de22191c5a059c67c306c3245d068153382497ff..f781f02a076594b5a70fd4863ebf273e88607dfd 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -51,25 +52,25 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { - InsertPendingVar(&pending_vars, ready_vars.get(), version_pair.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), version_pair); } } } for (auto &var : graph_->Get(details::kGraphDepVars)) { - InsertPendingVar(&pending_vars, ready_vars.get(), var.get()); + InsertPendingVar(&pending_vars, ready_vars.get(), var); } 
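// The executor no longer reads op handles from a graph attribute; they are
// recovered from their ir::Node wrappers via ir::FilterByNodeWrapper below.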
- for (auto &op : graph_->Get(details::kGraphOps)) { + for (auto &op : ir::FilterByNodeWrapper(*graph_)) { if (op->Inputs().empty()) { // Special case, Op has no input. - ready_ops.insert(op.get()); + ready_ops.insert(op); } else { - InsertPendingOp(&pending_ops, op.get()); + InsertPendingOp(&pending_ops, op); } } // Step 2. Insert FetchOps - std::vector> fetch_ops; - std::unordered_set> fetch_dependencies; + std::vector fetch_ops; + std::unordered_set fetch_dependencies; FeedFetchList fetch_data(fetch_tensors.size()); InsertFetchOps(fetch_tensors, &fetch_ops, &fetch_dependencies, &pending_ops, @@ -109,6 +110,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } + ClearFetchOp(graph_.get(), &fetch_ops); exception_holder_.ReThrow(); } else { continue; @@ -140,8 +142,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( void ThreadedSSAGraphExecutor::InsertFetchOps( const std::vector &fetch_tensors, - std::vector> *fetch_ops, - std::unordered_set> *fetch_dependencies, + std::vector *fetch_ops, + std::unordered_set *fetch_dependencies, std::unordered_map *pending_ops, std::unordered_set *pending_vars, BlockingQueue *ready_vars, FeedFetchList *fetch_data) { @@ -151,7 +153,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); + fetched_vars[fetch_var_name].push_back(*it->second.rbegin()); } } } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 5c0bc169eaf3f54596eb8e08b7bf80a82253c9b2..24da56c09e3e0f3894d58e5af8838c98e3e1e67c 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -70,13 +70,13 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { BlockingQueue *ready_vars, VarHandleBase *var) const; - void InsertFetchOps( - const std::vector &fetch_tensors, - std::vector> *fetch_ops, - std::unordered_set> *fetch_dependencies, - std::unordered_map *pending_ops, - std::unordered_set *pending_vars, - BlockingQueue *ready_vars, FeedFetchList *fetch_data); + void InsertFetchOps(const std::vector &fetch_tensors, + std::vector *fetch_ops, + std::unordered_set *fetch_dependencies, + std::unordered_map *pending_ops, + std::unordered_set *pending_vars, + BlockingQueue *ready_vars, + FeedFetchList *fetch_data); private: ExecutionStrategy strategy_; diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc index 5457870e9ff5d7cf67c9c7076b9aae94eeada779..30da029ca2a90e7faa6288557ff2f1aeb21cc1c6 100644 --- a/paddle/fluid/framework/details/var_handle.cc +++ b/paddle/fluid/framework/details/var_handle.cc @@ -20,6 +20,8 @@ namespace details { VarHandleBase::~VarHandleBase() {} +VarHandle::~VarHandle() { VLOG(4) << "deleting var handle " << DebugString(); } + std::string VarHandle::DebugString() const { std::stringstream ss; ss << name_ << ":" << place_; @@ -27,6 +29,10 @@ std::string VarHandle::DebugString() const { } std::string DummyVarHandle::DebugString() const { return node_->Name(); } + +DummyVarHandle::~DummyVarHandle() { + VLOG(4) << "deleting dummy var handle " << DebugString(); +} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.h 
b/paddle/fluid/framework/details/var_handle.h index a1f458c660ce9f73bc9ac2ed194091ad0b8f8400..3b007d7b1a52df765a2dbd41939f8f865123cb43 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -35,7 +35,10 @@ class OpHandleBase; // A variable can only be generated by a single operator. i.e. // This is a single assignment graph. struct VarHandleBase { - explicit VarHandleBase(ir::Node* node) : node_(node) {} + // Owned by `node`. No need to be deleted explicitly. + explicit VarHandleBase(ir::Node* node) : node_(node) { + node_->WrappedBy(this); + } virtual ~VarHandleBase(); @@ -94,6 +97,8 @@ struct VarHandleBase { struct VarHandle : public VarHandleBase { explicit VarHandle(ir::Node* node) : VarHandleBase(node) {} + virtual ~VarHandle(); + std::string DebugString() const override; VarHandle(ir::Node* node, size_t version, size_t scope_index, @@ -121,6 +126,8 @@ struct VarHandle : public VarHandleBase { struct DummyVarHandle : public VarHandleBase { explicit DummyVarHandle(ir::Node* node) : VarHandleBase(node) {} + virtual ~DummyVarHandle(); + std::string DebugString() const override; }; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 28231a53bad50fe9f19cfe3e73c3dc09aa3762cf..4cf973253cc4f1f22d2fc578a1ac3a8c95e479c9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -53,6 +53,7 @@ set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library") cc_library(pass_builder SRCS pass_builder.cc DEPS pass) +cc_test(node_test SRCS node_test.cc DEPS node) cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 46501f8d581daa6173da3830e353ef5aab4ae6ad..6384d89d2f2af4ab1d733af5eb1561cab2d09728 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -102,6 +102,15 @@ class Graph { attr_dels_[attr_name] = []() {}; } + template <typename AttrType> + void Erase(const std::string &attr_name) { + PADDLE_ENFORCE(attrs_.count(attr_name) != 0, "%s not set in the graph", + attr_name); + attr_dels_[attr_name](); + attrs_.erase(attr_name); + attr_dels_.erase(attr_name); + } + const std::unordered_set<ir::Node *> &Nodes() const { return node_set_; } // Create a normal variable with non-null VarDesc. diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index ec46b38c01b8c369ab37b4fbd5497ec120d8db91..8d92c406689ab3a97596a8666ceb452aec4be170 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -37,6 +37,15 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph); std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList( const Graph &graph); +template <typename T> +std::vector<T *> FilterByNodeWrapper(const Graph &graph) { + std::vector<T *> ret; + for (ir::Node *n : graph.Nodes()) { + if (n->IsWrappedBy<T>()) ret.push_back(&n->Wrapper<T>()); + } + return ret; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index a3be133344ac9b844fcfd7dd923a75cde3c5ebdd..b92245d862ab82affa89a2c3afef1d742bfcc1f0 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -15,7 +15,10 @@ limitations under the License.
*/ #pragma once #include <string> +#include <typeindex> +#include <typeinfo> #include <vector> + #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -24,9 +27,33 @@ namespace paddle { namespace framework { namespace ir { -// Node should normally created by Graph::CreateXXXNode(). +// Node should only be created by Graph::CreateXXXNode(). +// 1. Every Node should be part of a graph. No dangling Node exists. +// 2. Node only contains members necessary for building graph structure. +// It doesn't contain other unrelated members, such as device, etc. +// +// Sometimes, for specific usages, Node needs to have additional members, +// such as device_placement, version in order to be executed. It is suggested +// to use the composition pattern. +// +// class RunnableOp { +// RunnableOp(ir::Node* n) : n_(n) { n_->WrappedBy(this); } +// +// int any_thing_; +// } +// +// RunnableOp is owned by the ir::Node that composes it. In other words, +// ir::Node will be responsible for deleting RunnableOp, say, when ir::Node +// is deleted from the graph. class Node { public: + virtual ~Node() { + if (!wrapper_.empty()) { + VLOG(4) << "ir::Node deleting a wrapper node " << Name(); + wrapper_deleter_(); + } + } + enum class Type { kOperation, kVariable }; #if !defined(_WIN32) // msvc not support constexpr correctly. static constexpr char kControlDepVarName[] = "__control_var"; @@ -48,6 +75,29 @@ class Node { return op_desc_.get(); } + // Set the `wrapper` that wraps the Node. `wrapper` is owned by Node. + template <typename T> + void WrappedBy(T* wrapper) { + if (!wrapper_.empty()) { + wrapper_deleter_(); + } + wrapper_ = wrapper; + wrapper_deleter_ = [wrapper]() { delete wrapper; }; + wrapper_type_ = std::type_index(typeid(T)); + } + + // Return a reference to the `wrapper`. + template <typename T> + T& Wrapper() { + return *boost::any_cast<T*>(wrapper_); + } + + // Test if the Node is wrapped by type T. + template <typename T> + bool IsWrappedBy() { + return std::type_index(typeid(T)) == wrapper_type_; + } + // Please don't use this API! int id() const { return id_; } @@ -99,6 +149,11 @@ class Node { static int count_; // Please don't use this API or make this public. static void ResetId() { count_ = 0; } + + boost::any wrapper_; + std::function<void(void)> wrapper_deleter_; + std::type_index wrapper_type_ = std::type_index(typeid(void)); + DISABLE_COPY_AND_ASSIGN(Node); }; diff --git a/paddle/fluid/framework/ir/node_test.cc b/paddle/fluid/framework/ir/node_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..694efadda078169c993457181c00f7b357a09e87 --- /dev/null +++ b/paddle/fluid/framework/ir/node_test.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include <memory> +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RunnableOp { + public: + RunnableOp(Node* node, bool* alive) : node_(node), alive_(alive) { + node_->WrappedBy(this); + } + + virtual ~RunnableOp() { *alive_ = false; } + + private: + Node* node_; + bool* alive_; +}; + +class RunnableOp2 { + public: + RunnableOp2(Node* node, bool* alive) : node_(node), alive_(alive) { + node_->WrappedBy(this); + } + + virtual ~RunnableOp2() { *alive_ = false; } + + private: + Node* node_; + bool* alive_; +}; + +TEST(NodeTest, Basic) { + bool alive1 = true; + bool alive2 = true; + std::unique_ptr<Node> n1(CreateNodeForTest("n1", Node::Type::kVariable)); + std::unique_ptr<Node> n2(CreateNodeForTest("n2", Node::Type::kVariable)); + + EXPECT_FALSE(n1->IsWrappedBy<RunnableOp>()); + EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>()); + EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>()); + EXPECT_FALSE(n2->IsWrappedBy<RunnableOp2>()); + + new RunnableOp(n1.get(), &alive1); + new RunnableOp2(n2.get(), &alive2); + + EXPECT_TRUE(n1->IsWrappedBy<RunnableOp>()); + EXPECT_FALSE(n1->IsWrappedBy<RunnableOp2>()); + EXPECT_FALSE(n2->IsWrappedBy<RunnableOp>()); + EXPECT_TRUE(n2->IsWrappedBy<RunnableOp2>()); + + EXPECT_TRUE(alive1); + EXPECT_TRUE(alive2); + + n1.reset(nullptr); + n2.reset(nullptr); + EXPECT_FALSE(alive1); + EXPECT_FALSE(alive2); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 5e55acf892718223263e9c689d64316dc6682780..67a0b2eb3b18a0d12481c7205525ec47c4ac1f8a 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -38,8 +38,8 @@ if(WITH_TESTING) ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() -cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} paddle_inference_api - ARGS --dirname=${PYTHON_TESTS_DIR}/book) +cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor ${inference_deps} + ARGS --dirname=${WORD2VEC_MODEL_DIR}) if(WITH_GPU AND TENSORRT_FOUND) cc_library(paddle_inference_tensorrt_subgraph_engine diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 13c25da1b52742e6114b294847c21ce735b9fc21..f75c45f3a0438bc437e716160af8c5eab5b10fce 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -24,7 +24,7 @@ using contrib::AnalysisConfig; TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - config.model_dir = FLAGS_dirname + "/word2vec.inference.model"; + config.model_dir = FLAGS_dirname; config.use_feed_fetch_ops = false; auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f109dd685c87ab1b0776a855bb5f510eab1f5526 --- /dev/null +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -0,0 +1,131 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class SpaceToDepthOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SpaceToDepthOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SpaceToDepthOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 4, "input should be a 4D tensor");
+    auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
+
+    PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be greater than 1");
+    PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[2], 0, "input height should be greater than 0");
+    PADDLE_ENFORCE_GT(x_dims[3], 0, "input width should be greater than 0");
+
+    PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
+                      "input channel should be divisible by the square of "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
+                      "input height should be divisible by "
+                      "SpaceToDepthOp blocksize");
+    PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
+                      "input width should be divisible by "
+                      "SpaceToDepthOp blocksize");
+
+    VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
+            << ", attribute blocksize=" << blocksize;
+
+    std::vector<int64_t> output_shape(4, 0);  // [B, C, H, W]
+    output_shape[0] = x_dims[0];
+    output_shape[1] = x_dims[1] * blocksize * blocksize;
+    output_shape[2] = x_dims[2] / blocksize;
+    output_shape[3] = x_dims[3] / blocksize;
+
+    auto out_dims = framework::make_ddim(output_shape);
+
+    ctx->SetOutputDim("Out", out_dims);
+
+    if (x_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(Tensor). The input should be a 4D tensor B * C * H * W of "
+             "SpaceToDepthOp operator.");
+    AddOutput("Out",
+              "(Tensor), The output should be a 4D tensor B * C2 * H2 * W2 "
+              "of SpaceToDepthOp operator.");
+    AddAttr<int64_t>(
+        "blocksize",
+        "(int64_t, default 2) blocksize used to rearrange space to depth.")
+        .SetDefault(2)
+        .GreaterThan(1);
+    AddComment(R"DOC(
+        Reorg operator, used in YOLO v2.
+
+        Rearranges blocks of spatial data into depth. Given Input(X) of shape
+        [B, C1, H1, W1] and Attr(blocksize), the output has shape
+        [B, C2, H2, W2] with C2 = C1 * blocksize * blocksize,
+        H2 = H1 / blocksize and W2 = W1 / blocksize. The data in Input(X) are
+        unchanged; only their positions are rearranged.
+
+        Examples:
+
+        1. Given a 4-D tensor Input(X) with a shape [128, 2048, 26, 26], and the blocksize is 2, the reorg operator will transform Input(X)
+        into a 4-D tensor with shape [128, 8192, 13, 13], leaving Input(X)'s data unchanged.
+ + )DOC"); + } +}; + +class SpaceToDepthGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) shouldn't be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp); +REGISTER_OP_CPU_KERNEL( + space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); +REGISTER_OP_CPU_KERNEL( + space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..38d0a662733222386b8ecd68d064f3d1abe56c3b --- /dev/null +++ b/paddle/fluid/operators/space_to_depth_op.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/space_to_depth_op.h" + +namespace plat = paddle::platform; +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + space_to_depth, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel, + ops::SpaceToDepthKernel); + +REGISTER_OP_CUDA_KERNEL( + space_to_depth_grad, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel, + ops::SpaceToDepthGradKernel); diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h new file mode 100644 index 0000000000000000000000000000000000000000..a71662b4813ab27b65f5c7a918e2bb6fb15a1993 --- /dev/null +++ b/paddle/fluid/operators/space_to_depth_op.h @@ -0,0 +1,127 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/fluid/operators/space_to_depth_op.cu b/paddle/fluid/operators/space_to_depth_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..38d0a662733222386b8ecd68d064f3d1abe56c3b
--- /dev/null
+++ b/paddle/fluid/operators/space_to_depth_op.cu
@@ -0,0 +1,30 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/space_to_depth_op.h"
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth,
+    ops::SpaceToDepthKernel<plat::CUDADeviceContext, float>,
+    ops::SpaceToDepthKernel<plat::CUDADeviceContext, double>,
+    ops::SpaceToDepthKernel<plat::CUDADeviceContext, int64_t>);
+
+REGISTER_OP_CUDA_KERNEL(
+    space_to_depth_grad,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, float>,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, double>,
+    ops::SpaceToDepthGradKernel<plat::CUDADeviceContext, int64_t>);
diff --git a/paddle/fluid/operators/space_to_depth_op.h b/paddle/fluid/operators/space_to_depth_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a71662b4813ab27b65f5c7a918e2bb6fb15a1993
--- /dev/null
+++ b/paddle/fluid/operators/space_to_depth_op.h
@@ -0,0 +1,127 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifndef PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
+#define PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class space_to_depth_compute {
+ public:
+  HOSTDEVICE space_to_depth_compute(const T *x, int64_t w, int64_t h,
+                                    int64_t c, int64_t batch,
+                                    int64_t blocksize, int64_t forward,
+                                    T *out)
+      : x_(x),
+        w_(w),
+        h_(h),
+        c_(c),
+        batch_(batch),
+        blocksize_(blocksize),
+        forward_(forward),
+        out_(out) {}
+
+  HOSTDEVICE void operator()(int64_t in_index) {
+    int64_t out_c = c_ / (blocksize_ * blocksize_);
+    // calculate each dim position with index of tensor
+    int64_t b = in_index / (c_ * h_ * w_);
+    int64_t k = (in_index % (c_ * h_ * w_)) / (h_ * w_);
+    int64_t j = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) / w_;
+    int64_t i = ((in_index % (c_ * h_ * w_)) % (h_ * w_)) % w_;
+
+    int64_t c2 = k % out_c;
+    int64_t offset = k / out_c;
+    int64_t w2 = i * blocksize_ + offset % blocksize_;
+    int64_t h2 = j * blocksize_ + offset / blocksize_;
+    int64_t out_index =
+        w2 + w_ * blocksize_ * (h2 + h_ * blocksize_ * (c2 + out_c * b));
+    if (forward_)
+      out_[out_index] = x_[in_index];
+    else
+      out_[in_index] = x_[out_index];
+  }
+
+ private:
+  const T *x_;
+  int64_t w_, h_, c_, batch_, blocksize_, forward_;
+  T *out_;
+};
+
+template <typename DeviceContext, typename T>
+class SpaceToDepthKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *out = context.Output<framework::LoDTensor>("Out");
+    auto *x = context.Input<framework::LoDTensor>("X");
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto in_dims = x->dims();
+    out->mutable_data(context.GetPlace(), x->type());
+
+    auto out_dims = out->dims();
+    auto B = in_dims[0];
+    auto C = in_dims[1];
+    auto H = in_dims[2];
+    auto W = in_dims[3];
+    platform::ForRange<DeviceContext> for_range(
+        context.template device_context<DeviceContext>(),
+        static_cast<size_t>(x->numel()));
+
+    auto *x_data = x->data<T>();
+    auto *out_data = out->data<T>();
+    paddle::operators::space_to_depth_compute<T> computer(
+        x_data, W, H, C, B, blocksize, 1, out_data);
+    for_range(computer);
+
+    out->Resize(out_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SpaceToDepthGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *d_out =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto *d_x =
+        context.Output<framework::LoDTensor>(framework::GradVarName("X"));
+    auto blocksize = context.Attr<int64_t>("blocksize");
+    auto in_dims = d_x->dims();
+    d_x->mutable_data(context.GetPlace(), d_out->type());
+
+    auto B = in_dims[0];
+    auto C = in_dims[1];
+    auto H = in_dims[2];
+    auto W = in_dims[3];
+
+    platform::ForRange<DeviceContext> for_range(
+        context.template device_context<DeviceContext>(),
+        static_cast<size_t>(d_x->numel()));
+
+    auto *dx_data = d_x->data<T>();
+    auto *dout_data = d_out->data<T>();
+
+    paddle::operators::space_to_depth_compute<T> computer(
+        dout_data, W, H, C, B, blocksize, 0, dx_data);
+    for_range(computer);
+
+    d_x->Resize(in_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#endif  // PADDLE_FLUID_OPERATORS_SPACE_TO_DEPTH_OP_H_
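
SpaceToDepthGradKernel reuses the same functor with forward == 0, scattering each element of Out@GRAD back through the inverse of the forward permutation. A small standalone check (again a sketch, not project code) confirms that the mapping round-trips, which is why the backward pass needs no extra bookkeeping:

#include <cstdint>
#include <iostream>
#include <vector>

// The same index arithmetic as space_to_depth_compute, factored into a helper.
void reorg(const std::vector<int64_t>& src, std::vector<int64_t>* dst,
           int64_t B, int64_t C, int64_t H, int64_t W, int64_t bs,
           bool forward) {
  int64_t out_c = C / (bs * bs);
  for (int64_t in_index = 0; in_index < B * C * H * W; ++in_index) {
    int64_t b = in_index / (C * H * W);
    int64_t k = (in_index % (C * H * W)) / (H * W);
    int64_t j = ((in_index % (C * H * W)) % (H * W)) / W;
    int64_t i = ((in_index % (C * H * W)) % (H * W)) % W;
    int64_t c2 = k % out_c;
    int64_t offset = k / out_c;
    int64_t w2 = i * bs + offset % bs;
    int64_t h2 = j * bs + offset / bs;
    int64_t out_index = w2 + W * bs * (h2 + H * bs * (c2 + out_c * b));
    if (forward)
      (*dst)[out_index] = src[in_index];   // forward kernel path
    else
      (*dst)[in_index] = src[out_index];   // gradient kernel path
  }
}

int main() {
  const int64_t B = 1, C = 8, H = 4, W = 4, bs = 2;
  std::vector<int64_t> x(B * C * H * W), y(x.size()), back(x.size());
  for (size_t n = 0; n < x.size(); ++n) x[n] = static_cast<int64_t>(n);
  reorg(x, &y, B, C, H, W, bs, true);      // forward pass
  reorg(y, &back, B, C, H, W, bs, false);  // gradient-style gather back
  std::cout << (x == back ? "round trip ok" : "mismatch") << '\n';
  return 0;
}

Because the permutation is a bijection whenever C is divisible by blocksize^2 and H, W by blocksize, the gradient of space_to_depth is exactly the inverse rearrangement of the output gradient.
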
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index d85a3e6dcec406b42034d174347e51e7c72dacd7..b0a8efd5edcf4de7053438d63ba0315997d4e280 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -154,6 +154,7 @@ __all__ = [
     'mul',
     'sigmoid_cross_entropy_with_logits',
     'maxout',
+    'space_to_depth',
     'affine_grid',
     'sequence_reverse',
     'affine_channel',
@@ -7674,6 +7675,66 @@ def maxout(x, groups, name=None):
     return out
 
 
+def space_to_depth(x, blocksize, name=None):
+    """
+    Rearranges blocks of spatial data into depth for an input LoDtensor with
+    layout [batch, channel, height, width], according to a given blocksize.
+
+    This op outputs a copy of the input LoDtensor where values from the
+    height and width dimensions are moved to the channel dimension. The attr
+    blocksize indicates the size of the spatial block.
+
+    space_to_depth reorganizes the elements of an input with shape
+    [batch, channel, height, width] into an output with shape
+    [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]:
+
+    This operation is useful for resizing the activations between
+    convolutions without losing data.
+
+    - Non-overlapping blocks of size blocksize x blocksize are rearranged
+      into depth at each location.
+    - The depth of the output tensor is channel * blocksize * blocksize.
+    - The Y, X coordinates within each block of the input become the high
+      order component of the output channel index.
+    - channel must be divisible by the square of blocksize.
+    - height and width must be divisible by blocksize.
+
+    Args:
+        x(Variable): The input LoDtensor.
+        blocksize(int): The blocksize to select the elements on each feature
+            map. It must be greater than 1.
+
+    Returns:
+        Variable: The output LoDtensor.
+
+    Raises:
+        ValueError: If blocksize is not a Python int.
+
+    Examples:
+        .. code-block:: python
+
+            data = fluid.layers.data(
+                name='data', shape=[1, 4, 2, 2], dtype='float32')
+            space_to_depthed = fluid.layers.space_to_depth(
+                x=data, blocksize=2)
+    """
+
+    helper = LayerHelper("space_to_depth", **locals())
+
+    if not isinstance(blocksize, int):
+        raise ValueError("blocksize must be a Python int")
+
+    if name is None:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    else:
+        out = helper.create_variable(
+            name=name, dtype=x.dtype, persistable=False)
+
+    helper.append_op(
+        type="space_to_depth",
+        inputs={"X": x},
+        attrs={"blocksize": blocksize},
+        outputs={"Out": out})
+    return out
+
+
 @templatedoc()
 def sequence_reverse(x, name=None):
     """
diff --git a/python/paddle/fluid/op.py b/python/paddle/fluid/op.py
index 4e1d1450dea85fe4eb3e68713250836e4beac992..b8bb3db1eedcf25c9b6a02ad3b4f261e8be8efce 100644
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
@@ -108,6 +108,8 @@ class OpDescCreationMethod(object):
                 new_attr.i = user_defined_attr
             elif attr.type == framework_pb2.FLOAT:
                 new_attr.f = user_defined_attr
+            elif attr.type == framework_pb2.LONG:
+                new_attr.l = user_defined_attr
             elif attr.type == framework_pb2.STRING:
                 new_attr.s = user_defined_attr
             elif attr.type == framework_pb2.BOOLEAN:
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index c4ecc2c2c2563fcad09821453ee73e41f81407d5..49ba41e6fc908e9713414120bbeb45ca715042c3 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -248,6 +248,17 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.softmax(hid))
         print(str(program))
 
+    def test_space_to_depth(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(
+                name='data',
+                shape=[32, 9, 6, 6],
+                append_batch_size=False,
+                dtype='float32')
+            self.assertIsNotNone(layers.space_to_depth(data, 3))
+        print(str(program))
+
     def test_sequence_unsqueeze(self):
         program = Program()
         with program_guard(program):
diff --git a/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fdad44f1242b9ee99040b43d7ce2cf84664eed1
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_space_to_depth_op.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle.fluid as fluid
+from op_test import OpTest
+
+
+class TestSpaceToDepthOp(OpTest):
+    @staticmethod
+    def helper(in_, width, height, channel, batch, blocksize, forward, out_):
+        channel_out = channel // (blocksize * blocksize)
+        for b in range(batch):
+            for k in range(channel):
+                for j in range(height):
+                    for i in range(width):
+                        in_index = i + width * (j + height * (k + channel * b))
+                        channel2 = k % channel_out
+                        offset = k // channel_out
+                        width2 = i * blocksize + offset % blocksize
+                        height2 = j * blocksize + offset // blocksize
+                        out_index = width2 + width * blocksize * (
+                            height2 + height * blocksize *
+                            (channel2 + channel_out * b))
+                        if forward:
+                            out_[out_index] = in_[in_index]
+                        else:
+                            out_[in_index] = in_[out_index]
+
+    def setUp(self):
+        self.init_data()
+
+        self.op_type = "space_to_depth"
+        self.inputs = {"X": self.x}
+        self.helper(self.x_1d, self.x.shape[3], self.x.shape[2],
+                    self.x.shape[1], self.x.shape[0], self.blocksize,
+                    self.forward, self.out_1d)
+        self.out = np.reshape(self.out_1d, self.infered_shape)
+        self.attrs = {"blocksize": self.blocksize}
+        self.outputs = {"Out": self.out}
+
+    def init_data(self):
+        self.ori_shape = (32, 12, 6, 6)
+        self.infered_shape = (32, 48, 3, 3)
+        self.one_d_len = 32 * 48 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+    def test_check_output(self):
+        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.core.CPUPlace()
+        self.check_output_with_place(place, 1e-5, None, False)
+
+    def test_check_grad(self):
+        place = fluid.core.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.core.CPUPlace()
+        self.check_grad_with_place(place, ['X'], 'Out')
+
+
+class TestSpaceToDepthOpBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpDoubleBasic(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 8, 6, 6)
+        self.infered_shape = (32, 32, 3, 3)
+        self.one_d_len = 32 * 32 * 3 * 3
+
+        self.blocksize = 2
+        self.x = np.random.random(self.ori_shape).astype('float64')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float64')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpWithStride3(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 9, 6, 6)
+        self.infered_shape = (32, 81, 2, 2)
+        self.one_d_len = 32 * 81 * 2 * 2
+
+        self.blocksize = 3
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+class TestSpaceToDepthOpWithNotSquare(TestSpaceToDepthOp):
+    def init_data(self):
+        self.ori_shape = (32, 9, 9, 6)
+        self.infered_shape = (32, 81, 3, 2)
+        self.one_d_len = 32 * 81 * 3 * 2
+
+        self.blocksize = 3
+        self.x = np.random.random(self.ori_shape).astype('float32')
+        self.x_1d = np.reshape(self.x, self.one_d_len)
+        self.out = np.zeros(self.infered_shape).astype('float32')
+        self.out_1d = np.reshape(self.out, self.one_d_len)
+        self.forward = 1
+
+
+if __name__ == '__main__':
+    unittest.main()