diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index a33b5a9c9312c93247a1e1f3431061a5aad6c884..c29337cba1fe859e4968cb800e4e7d9ff6a54d31 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -64,6 +64,41 @@ can also contain other things that describe some properties of the `Graph` or `Graph` nodes. `Attribute` can be passed across `Pass`. However, it should be used with care. +```cpp +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + bool Has(const std::string &attr_name) const; + + template + AttrType &Get(const std::string &attr_name) const; + + template + void Set(const std::string &attr_name, AttrType *attr); + const std::unordered_set &Nodes() const; + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc); + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc); + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar(); + + // A more free style way of creating a graph node. Mostly use for test + // or "copy" from another node. Avoid using it if possible. + ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector> ReleaseNodes(); +}; +``` + #### Pass `Pass` represents a transformation of `Graph`. Its input @@ -71,6 +106,54 @@ is a `Graph` and its output is also a `Graph`. For example, a `Pass` can simply print out the `Graph`. A `Pass` can also fuse some `Graph`'s `Node`s. +```cpp +class Pass { + public: + + std::unique_ptr Apply(std::unique_ptr graph) const { + // Some correctness check. + auto new_graph = ApplyImpl(std::move(graph)); + // Some correctness check. + return new_graph; + } + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const; + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) ; + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr); + + protected: + virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const = 0; +}; + +// In my_pass.cc +class MyPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const override { + // do something. + return graph; + } +} +REGISTER_PASS(my_pass, MyPass) +.RequirePassAttr("places") +.RequireGraphAttr("dep_vars"); + + +// To use the pass. +auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); +graph = my_pass->Apply(std::move(graph)); +// Note: to force link my_pass.cc, in the code: +USE_PASS(my_pass); +``` + #### Optimize `Optimize` contains a series of `Pass` with defined order. @@ -86,4 +169,17 @@ maintaining the original modeling logic. * Graph is transformed from raw model logic to a form that is efficient to execute. -Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +``` +// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +auto graph = Graph(program); +graph = PassRegistry::Instance().Get("op_fuse_pass").Apply(std::move(grah)); +// For more complex Pass, Optimize Process can provide Pass attributes. +auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass"); +mem_opt_pass.SetNotOwned("optimize_level", 1); +mem_opt_pass->Apply(std::move(graph)); +graph = PassRegistry::Instance().Get("multi_device_pass").Apply(std::move(grah)); +graph = PassRegistry::Instance().Get("multi_device_check_pass").Apply(std::move(grah)); +Executor exe; +exe.Run(graph); + +``` diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6efb03dabe89b28f3ff1a55c4a940dfe74e8001d..5f3bfa296546fcbc6a3410d7ae072ff74954bc74 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -170,6 +170,7 @@ paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], var paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) @@ -201,7 +202,6 @@ paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs= paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.layers.While.complete ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.case ArgSpec(args=['self', 'condition'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.Switch.default ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) @@ -225,17 +225,14 @@ paddle.fluid.layers.DynamicRNN.static_input ArgSpec(args=['self', 'x'], varargs= paddle.fluid.layers.DynamicRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.DynamicRNN.update_memory ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.StaticRNN.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.memory ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)) paddle.fluid.layers.StaticRNN.output ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None) -paddle.fluid.layers.StaticRNN.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_input ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.step_output ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.StaticRNN.update_memory ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reorder_lod_tensor_by_rank ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.__init__ ArgSpec(args=['self', 'places', 'use_nccl', 'name'], varargs=None, keywords=None, defaults=(False, None)) -paddle.fluid.layers.ParallelDo.complete_op ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.do ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.get_parameters ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.ParallelDo.parent_block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index df2a7bf90d9be480c514d9dc70571c7f56fd8db2..139411f3e0d945f9265d19a28487c05d06722d69 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -99,7 +99,7 @@ else() endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9df7df1f42886d40210b16aa2ae5823e3310bfe7..5d652d37307d0a55ffee14930ae180dcd3e27841 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -31,9 +31,6 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) - -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) - cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/exception_holder.h b/paddle/fluid/framework/details/exception_holder.h new file mode 100644 index 0000000000000000000000000000000000000000..6e302a29233b96451df14b4685911be1cd87c1ab --- /dev/null +++ b/paddle/fluid/framework/details/exception_holder.h @@ -0,0 +1,83 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace details { + +class ExceptionHolder { + public: + void Catch(const platform::EnforceNotMet& exp) { + std::lock_guard lock(mu_); + exception_.reset(new platform::EnforceNotMet(exp)); + type_ = kEnforceNotMet; + } + + void Catch(const platform::EOFException& exp) { + std::lock_guard lock(mu_); + // EOFException will not cover up existing EnforceNotMet. + if (exception_.get() == nullptr) { + exception_.reset(new platform::EOFException(exp)); + type_ = kEOF; + } + } + + bool ExceptionCatched() const { + std::lock_guard lock(mu_); + return exception_.get() != nullptr; + } + + void Throw() { + std::lock_guard lock(mu_); + switch (type_) { + case kNone: + break; + case kEnforceNotMet: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + case kEOF: { + auto e = *static_cast(exception_.get()); + throw e; + break; + } + default: + LOG(FATAL) << "Unknown exception."; + } + exception_.reset(); + type_ = kNone; + } + + void Clear() { + std::lock_guard lock(mu_); + exception_.reset(); + type_ = kNone; + } + + private: + enum ExceptionType { kNone, kEnforceNotMet, kEOF }; + ExceptionType type_{kNone}; + + std::unique_ptr exception_; + mutable std::mutex mu_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index b7b67916205689753bc3f9fe844945ee3e78eeb4..5ca2ed8f96244a11925dfa6af8e48458cf334ecd 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,30 +34,22 @@ namespace paddle { namespace framework { namespace details { +static const char kLossVarName[] = "loss_var_name"; +static const char kPlaces[] = "places"; +static const char kParams[] = "params"; +static const char kLocalScopes[] = "local_scopes"; +static const char kStrategy[] = "strategy"; + +void MultiDevSSAGraphBuilder::Init() const { + loss_var_name_ = Get(kLossVarName); + places_ = Get>(kPlaces); + local_scopes_ = Get>(kLocalScopes); + strategy_ = Get(kStrategy); #ifdef PADDLE_WITH_CUDA -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs), - strategy_(strategy) { -#else -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - strategy_(strategy) { + nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : params) { + + for (auto &p : Get>(kParams)) { grad_names_.insert(GradVarName(p)); } balance_vars_.resize(places_.size(), 0); @@ -72,7 +64,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t place_id) const { auto p = places_[place_id]; - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -239,8 +231,9 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::Apply( +std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { + Init(); // Give the topology sort order and rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); auto nodes = graph->ReleaseNodes(); @@ -254,9 +247,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 - result.Set("vars", new GraphVars(places_.size())); - result.Set("dep_vars", new GraphDepVars); - result.Set("ops", new GraphOps); + result.Set(kGraphVars, new GraphVars(places_.size())); + result.Set(kGraphDepVars, new GraphDepVars); + result.Set(kGraphOps, new GraphOps); + result.Set(kShardedVarDevice, new ShardedVarDevice); // find send/recv vars so that we can place the distributed training // related op in the place 0 @@ -289,11 +283,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(node); + int op_dev_id = GetOpDeviceID(result, node); if (op_dev_id != -1) { // This op only runs on one specific device. CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - var_name_on_devices_.emplace(n->Name(), op_dev_id); + graph->Get(kShardedVarDevice) + .emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's @@ -330,7 +325,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); + graph->Get(kShardedVarDevice) + .emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: @@ -416,16 +412,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_); #endif - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); auto *in = - result->Get("vars").at(src_dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars").at(i).at(p_name); + auto &vars = result->Get(kGraphVars).at(i).at(p_name); auto *out_var = new VarHandle( result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), i, p_name, p); @@ -437,7 +433,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id])); CreateOpHandleIOs(result, node, dev_id); @@ -446,20 +442,20 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -475,20 +471,20 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { - auto &vars = result->Get("vars")[i][d_name]; + auto &vars = result->Get(kGraphVars)[i][d_name]; PADDLE_ENFORCE(!vars.empty()); op_handle->AddInput(vars.back().get()); auto var = new VarHandle( @@ -512,7 +508,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, + ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } @@ -525,15 +522,17 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1]); + int dev_id = GetVarDeviceID(graph, param_grad[1]); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } -int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { - auto got = var_name_on_devices_.find(varname); - return got == var_name_on_devices_.end() ? -1 : got->second; +int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, + const std::string &varname) const { + auto &sharded_var_device = graph.Get(kShardedVarDevice); + auto got = sharded_var_device.find(varname); + return got == sharded_var_device.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { @@ -551,7 +550,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), local_scopes_.size(), local_scopes_[i], places_[i], communication_dev_ctx); - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. @@ -572,7 +571,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); CreateOpHandleIOs(result, node, scope_idx); } @@ -582,25 +581,25 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } - auto &vars = result->Get("vars")[dst_dev_id][og]; + auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), dst_dev_id, og, places_[dst_dev_id]); @@ -613,11 +612,11 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, // on it. void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { - for (auto &prev_op : result->Get("ops")) { + for (auto &prev_op : result->Get(kGraphOps)) { if (prev_op->Name() == prev_op_name) { auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); - result->Get("dep_vars").emplace(dep_var); + result->Get(kGraphDepVars).emplace(dep_var); op->AddInput(dep_var); } } @@ -638,20 +637,23 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { PADDLE_ENFORCE( @@ -665,7 +667,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, CreateComputationalOp(result, node, op_dev_id); if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "fetch_barrier"); } } @@ -676,7 +678,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -691,7 +693,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } } else if (node->Op()->Type() == "recv") { @@ -701,7 +704,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(output_var_names); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { // send_barrier and fetch_barrier op can be scheduled on device 0 @@ -711,18 +715,18 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get("ops").emplace_back(new RPCOpHandle( + result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); // TODO(panyx0718): This might not be needed anymore. if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "send"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "send_barrier"); } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "recv"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); } else if (node->Op()->Type() == "send") { // do nothing } else { @@ -744,3 +748,11 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_pass, + paddle::framework::details::MultiDevSSAGraphBuilder) + .RequirePassAttr(paddle::framework::details::kLossVarName) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kParams) + .RequirePassAttr(paddle::framework::details::kLocalScopes) + .RequirePassAttr(paddle::framework::details::kStrategy); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 55076f227b5ab56d66b5053173c9e915da23b15f..099dbe5abef6458c4613c9f680440734f59cb6e2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -31,39 +31,27 @@ class Scope; namespace details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: -#ifdef PADDLE_WITH_CUDA - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - const BuildStrategy &strategy); -#else - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - const BuildStrategy &strategy); -#endif - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t device_id) const; + void Init() const; private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - std::unordered_set grad_names_; + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; #ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nccl_ctxs_; + mutable platform::NCCLContextMap *nccl_ctxs_; #endif + int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const; + bool IsScaleLossOp(ir::Node *node) const; void CreateRPCOp(ir::Graph *result, ir::Node *node) const; @@ -97,7 +85,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(ir::Node *node) const; + int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -113,9 +101,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &var_names) const; private: - BuildStrategy strategy_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; - mutable std::unordered_map var_name_on_devices_; mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index cbfbcb1c0cd24f16773f9633310166371600790c..5e87e0bf50b51d2b630aba06a5907dd721754d1f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -40,6 +40,11 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const override { + return underlying_executor_->Graph(); + } + FeedFetchList Run(const std::vector& fetch_tensors) override; private: diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 506e7eb35cd977869424223cb863dd64dbaa9d30..575532540a624afde5f6dab25b11e9eac93c6448 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { namespace details { void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { continue; @@ -50,7 +50,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); read_op->AddOutput(dep_var); write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); } } } @@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { - auto &var_holders = graph->Get("vars")[place_offset]; + auto &var_holders = graph->Get(kGraphVars)[place_offset]; auto &var_holder = var_holders[node->Name()]; VarHandle *var = nullptr; if (var_holder.empty()) { @@ -83,7 +83,8 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, ir::Node *new_node, const platform::Place &place, size_t place_offset) { - auto &vars = graph->Get("vars")[place_offset][new_node->Name()]; + auto &vars = + graph->Get(kGraphVars)[place_offset][new_node->Name()]; size_t version = vars.size(); auto var = new VarHandle(new_node, version, place_offset, new_node->Name(), place); @@ -92,12 +93,12 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, } void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) { - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (!op->Outputs().empty()) { continue; } auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get("dep_vars").emplace(dummy_leaf); + graph->Get(kGraphDepVars).emplace(dummy_leaf); op->AddOutput(dummy_leaf); } } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 2b4f31f2ff3444f909e3be5eb810ae6737e237b2..53a4ad003d51a27a044d7a142434545eca0d5965 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -39,21 +39,25 @@ namespace details { typedef std::vector< std::unordered_map>>> GraphVars; +const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. typedef std::unordered_set> GraphDepVars; +const char kGraphDepVars[] = "dep_vars"; // all operators. NOTE that even we use a vector here, the operators is // unordered. typedef std::vector> GraphOps; +const char kGraphOps[] = "ops"; + +typedef std::unordered_map ShardedVarDevice; +const char kShardedVarDevice[] = "sharded_var_device"; class SSAGraphBuilder : public ir::Pass { public: SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} - virtual int GetVarDeviceID(const std::string &var_name) const = 0; - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc deleted file mode 100644 index b4b49d3de6da2e5fd7836668619e42d10bb6b35a..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" -#include -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph_checker.h" -#include "paddle/fluid/framework/details/ssa_graph_printer.h" - -namespace paddle { -namespace framework { -namespace details { -std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res( -#ifdef PADDLE_WITH_CUDA - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, nccl_ctxs_, strategy_) -#else - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, strategy_) -#endif - ); // NOLINT - - if (!strategy_.debug_graphviz_path_.empty()) { - std::unique_ptr fout( - new std::ofstream(strategy_.debug_graphviz_path_)); - PADDLE_ENFORCE(fout->good()); - std::unique_ptr graphviz_printer( - new GraphvizSSAGraphPrinter()); - res.reset(new SSAGraghBuilderWithPrinter( - std::move(fout), std::move(graphviz_printer), std::move(res))); - } - res.reset(new SSAGraghBuilderWithChecker(std::move(res))); - - return res; -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h deleted file mode 100644 index 91a119de83ed3d1573803e48faf86c874eed98d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include "paddle/fluid/platform/place.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class SSAGraphBuilderFactory { - public: - SSAGraphBuilderFactory(const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, - const BuildStrategy& strategy) - : places_(places), - loss_var_name_(loss_var_name), - param_names_(param_names), - local_scopes_(local_scopes), - strategy_(strategy) { -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_ = nullptr; -#endif - } - -#ifdef PADDLE_WITH_CUDA - void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { - nccl_ctxs_ = nccl_ctxs; - } -#endif - - std::unique_ptr Create(); - - private: - std::vector places_; - std::string loss_var_name_; - std::unordered_set param_names_; - std::vector local_scopes_; - BuildStrategy strategy_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap* nccl_ctxs_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc index 0438b096109a287366610d06ef2bd14c765a8f43..b9e1cda1f24810009bc74a7abdf0156f723a1755 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } }; - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { insert_pending_var(version_pair.get()); @@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } } - for (auto &var : graph->Get("dep_vars")) { + for (auto &var : graph->Get(kGraphDepVars)) { insert_pending_var(var.get()); } - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (op->Inputs().empty()) { ready_ops.insert(op.get()); } else { @@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_check_pass, + paddle::framework::details::SSAGraghBuilderWithChecker) + .RequireGraphAttr(paddle::framework::details::kGraphVars) + .RequireGraphAttr(paddle::framework::details::kGraphDepVars) + .RequireGraphAttr(paddle::framework::details::kGraphOps) + .RequireGraphAttr(paddle::framework::details::kShardedVarDevice); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 51ce6e5ecad755613551aa6525b5cfbe4a8933ef..0e861ecb236361992d9883e3bd0e679f7563b539 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -23,26 +23,14 @@ namespace framework { namespace details { class SSAGraghBuilderWithChecker : public SSAGraphBuilder { - public: - explicit SSAGraghBuilderWithChecker( - std::unique_ptr&& builder) - : builder_(std::move(builder)) {} - - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - PADDLE_ENFORCE(IsValidGraph(new_graph.get())); - return new_graph; - } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; } bool IsValidGraph(const ir::Graph* graph) const; - - private: - std::unique_ptr builder_; }; } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 8815ec89b23bc874471eefde5fa855cd2a4bde1f..96fffb7d9430cd00b3823ada9fbe9a65a6bd718c 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -32,7 +32,9 @@ class SSAGraphExecutor { virtual ~SSAGraphExecutor(); - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc index 20aab1464400aa9bb1bd6af11c06269c688a8308..ec3f31ab8d135efd2c77018e90cec46b25ca5e66 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -22,7 +22,7 @@ namespace details { template static inline void IterAllVar(const ir::Graph &graph, Callback callback) { - for (auto &each : graph.Get("vars")) { + for (auto &each : graph.Get(kGraphVars)) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,7 +30,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) { } } - for (auto &var : graph.Get("dep_vars")) { + for (auto &var : graph.Get(kGraphDepVars)) { callback(*var); } } @@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, }); size_t op_id = 0; - for (auto &op : graph.Get("ops")) { + for (auto &op : graph.Get(kGraphOps)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; @@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_print_pass, + paddle::framework::details::SSAGraghBuilderWithPrinter); diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index a77c1bad3f15bca9064ded860696eb68b033b090..5eafd1805c3102dbd3cdfa68ee1495631c182b51 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/details/ssa_graph_builder.h" @@ -34,38 +36,15 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { }; class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { - public: - SSAGraghBuilderWithPrinter(std::ostream& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ref_(sout) {} - - SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ptr_(std::move(sout)), - stream_ref_(*stream_ptr_) {} - - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - printer_->Print(*new_graph, stream_ref_); - return new_graph; + std::unique_ptr fout( + new std::ofstream(Get("debug_graphviz_path"))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*graph, *fout); + return graph; } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); - } - - private: - std::unique_ptr printer_; - std::unique_ptr builder_; - std::unique_ptr stream_ptr_; - std::ostream& stream_ref_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c19f74476f9a1498a7d61f5faf204e9966aea155..e556c84b0219eba2b92c456c205e03947171626b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,18 +45,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->Get("dep_vars")) { + for (auto &var : graph_->Get(details::kGraphDepVars)) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->Get("ops")) { + for (auto &op : graph_->Get(details::kGraphOps)) { if (op->Inputs().empty()) { // Special case, Op has no input. ready_ops.insert(op.get()); } else { @@ -83,7 +83,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Clean run context run_op_futures_.clear(); - exception_.reset(); + exception_holder_.Clear(); // Step 3. Execution while (!pending_vars.empty()) { @@ -103,23 +103,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto cur_ready_vars = ready_vars.PopAll(1, &timeout); if (timeout) { - std::unique_lock l(exception_mu_); - if (exception_) { - l.unlock(); + if (exception_holder_.ExceptionCatched()) { for (auto &run_op_future : run_op_futures_) { run_op_future.wait(); } - l.lock(); - std::exception *exp = exception_.get(); - if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else if (dynamic_cast(exp)) { - auto e = *static_cast(exp); - throw e; - } else { - LOG(FATAL) << "Unknown exception."; - } + exception_holder_.Throw(); } else { continue; } @@ -162,7 +150,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); @@ -229,14 +217,9 @@ void ThreadedSSAGraphExecutor::RunOp( ready_var_q->Extend(op->Outputs()); VLOG(10) << op << " " << op->Name() << "Signal posted"; } catch (platform::EOFException ex) { - std::lock_guard l(exception_mu_); - // EOFException will not cover up existing EnforceNotMet. - if (exception_.get() == nullptr) { - exception_.reset(new platform::EOFException(ex)); - } + exception_holder_.Catch(ex); } catch (platform::EnforceNotMet ex) { - std::lock_guard l(exception_mu_); - exception_.reset(new platform::EnforceNotMet(ex)); + exception_holder_.Catch(ex); } catch (...) { LOG(FATAL) << "Unknown exception catched"; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3d67daa45e20fdea52689684397ad01f2f4cd783..9135c1f5d435d5e2c60eb90c80803361aa31a3c4 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,7 @@ #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/blocking_queue.h" +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -42,6 +43,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::vector &places, std::unique_ptr &&graph); + const ir::Graph &Graph() const override { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; @@ -58,8 +60,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; - std::mutex exception_mu_; - std::unique_ptr exception_; + ExceptionHolder exception_holder_; std::atomic running_ops_; void InsertPendingOp(std::unordered_map *pending_ops, diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6447452ae58344273fe569c91168c7c95a901c8d..bf7d76a8a6e173e648cea5aaba9b7202d787173b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,6 +1,9 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) -cc_library(pass SRCS pass.cc DEPS graph node) -cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) -cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) +cc_library(pass SRCS pass.cc DEPS graph node graph_helper) +cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) + +cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) +cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 4f59ec82a7d1217621c95d9a4a433a9af43e95da..c9d55fbf525a1a476ac469e8e57462169a7db2da 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -40,14 +40,21 @@ class Graph { attr_dels_.clear(); } + bool Has(const std::string &attr_name) const { + return attrs_.find(attr_name) != attrs_.end(); + } + template AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", + attr_name); return *boost::any_cast(attrs_.at(attr_name)); } template void Set(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8cb812d1388bf74d173a4dc7a99561e730f8e95a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { +static const char kGraphVizPath[] = "graph_viz_path"; + +std::unique_ptr GraphVizPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string graph_viz_path = Get(kGraphVizPath); + std::unique_ptr fout(new std::ofstream(graph_viz_path)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kVariable) continue; + size_t cur_var_id = var_id++; + vars[n] = cur_var_id; + + sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" + << std::endl; + } + + size_t op_id = 0; + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kOperation) continue; + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" + << std::endl; + for (auto in : n->inputs) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : n->outputs) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1fd8c8a26e9581ccf605d4271a49ec2e90d8b997 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphVizPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index c05d7d0bb54c8ba5938e08f7e8dace8f607d7b89..d7158eba62686be57499df697466797e4034ea8f 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,7 +13,34 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +namespace ir { +std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); + PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not set.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", + attr); + } + auto applied_graph = ApplyImpl(std::move(graph)); + // TODO(panyx0718): Add more verifications. + PADDLE_ENFORCE(!HasCircle(*applied_graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + applied_ = true; + return applied_graph; +} + +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index f52ba788d55ddb9ed27baa3f6ff0a97e52370fe0..0f14083d259172f5b5f1ed80c7d38312d711beb5 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -14,21 +14,187 @@ limitations under the License. */ #pragma once +#include +#include +#include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { namespace ir { +template +struct PassRegistrar; class Pass { public: Pass() = default; - virtual ~Pass() {} + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } + + std::unique_ptr Apply(std::unique_ptr graph) const; + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for pass.", attr_name); + return *boost::any_cast(attrs_.at(attr_name)); + } + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template + void SetNotOwned(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + attrs_[attr_name] = attr; + } + + protected: + virtual std::unique_ptr ApplyImpl( + std::unique_ptr graph) const = 0; + + private: + template + friend struct PassRegistrar; + + void RegisterRequiredPassAttrs(const std::unordered_set &attrs) { + required_pass_attrs_.insert(attrs.begin(), attrs.end()); + } + + void RegisterRequiredGraphAttrs( + const std::unordered_set &attrs) { + required_graph_attrs_.insert(attrs.begin(), attrs.end()); + } + + mutable bool applied_{false}; + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; + std::map attrs_; + std::map> attr_dels_; +}; + +using PassCreator = std::function()>; + +class Registrar { + public: + // In our design, various kinds of passes, + // have their corresponding registry and registrar. The action of + // registration is in the constructor of a global registrar variable, which + // are not used in the code that calls package framework, and would + // be removed from the generated binary file by the linker. To avoid such + // removal, we add Touch to all registrar classes and make USE_PASS macros to + // call this method. So, as long as the callee code calls USE_PASS, the global + // registrar variable won't be removed by the linker. + void Touch() {} +}; - virtual std::unique_ptr Apply(std::unique_ptr graph) const = 0; +class PassRegistry { + public: + static PassRegistry &Instance(); + + bool Has(const std::string &pass_type) const { + return map_.find(pass_type) != map_.end(); + } + + void Insert(const std::string &pass_type, const PassCreator &pass_creator) { + PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type); + map_.insert({pass_type, pass_creator}); + } + + std::unique_ptr Get(const std::string &pass_type) const { + PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered", + pass_type); + return map_.at(pass_type)(); + } + + private: + PassRegistry() = default; + std::unordered_map map_; + + DISABLE_COPY_AND_ASSIGN(PassRegistry); }; + +template +struct PassRegistrar : public Registrar { + explicit PassRegistrar(const char *pass_type) { + PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type), + "'%s' is registered more than once.", pass_type); + PassRegistry::Instance().Insert( + pass_type, [this]() -> std::unique_ptr { + std::unique_ptr pass(new PassType()); + pass->RegisterRequiredPassAttrs(this->required_pass_attrs_); + pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_); + return pass; + }); + } + + PassRegistrar &RequirePassAttr(const std::string &attr) { + required_pass_attrs_.insert(attr); + return *this; + } + + PassRegistrar &RequireGraphAttr(const std::string &attr) { + required_graph_attrs_.insert(attr); + return *this; + } + + private: + std::unordered_set required_pass_attrs_; + std::unordered_set required_graph_attrs_; +}; + +#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg) \ + struct __test_global_namespace_##uniq_name##__ {}; \ + static_assert(std::is_same<::__test_global_namespace_##uniq_name##__, \ + __test_global_namespace_##uniq_name##__>::value, \ + msg) + +// Register a new pass that can be applied on the IR. +#define REGISTER_PASS(pass_type, pass_class) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __reg_pass__##pass_type, \ + "REGISTER_PASS must be called in global namespace"); \ + static ::paddle::framework::ir::PassRegistrar \ + __pass_registrar_##pass_type##__(#pass_type); \ + int TouchPassRegistrar_##pass_type() { \ + __pass_registrar_##pass_type##__.Touch(); \ + return 0; \ + } \ + static ::paddle::framework::ir::PassRegistrar \ + &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \ + __pass_registrar_##pass_type##__ + +#define USE_PASS(pass_type) \ + STATIC_ASSERT_PASS_GLOBAL_NAMESPACE( \ + __use_pass_itself_##pass_type, \ + "USE_PASS must be called in global namespace"); \ + extern int TouchPassRegistrar_##pass_type(); \ + static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \ + TouchPassRegistrar_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b5011412ed39e033a7a65921e9c64ce2d54c638 --- /dev/null +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/pass.h" +#include +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { +void BuildCircleGraph(Graph* g) { + ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation); + ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation); + ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable); + ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable); + + o1->outputs.push_back(v1); + o2->inputs.push_back(v1); + v1->inputs.push_back(o1); + v1->outputs.push_back(o2); + + o2->outputs.push_back(v2); + o1->inputs.push_back(v2); + v2->inputs.push_back(o2); + v2->outputs.push_back(o1); +} + +class TestPass : public Pass { + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + graph->Set("copy_test_pass_attr", new int); + graph->Set("copy_test_graph_attr", new int); + + int test_pass_attr = this->Get("test_pass_attr"); + graph->Get("copy_test_pass_attr") = test_pass_attr + 1; + + int test_graph_attr = graph->Get("test_graph_attr"); + graph->Get("copy_test_graph_attr") = test_graph_attr + 1; + return graph; + } +}; + +TEST(PassTest, TestPassAttrCheck) { + ProgramDesc prog; + auto pass = PassRegistry::Instance().Get("test_pass"); + std::unique_ptr graph(new Graph(prog)); + std::string exception; + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos); + + int val = 1; + graph.reset(new Graph(prog)); + pass->SetNotOwned("test_pass_attr", &val); + + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos); + + graph.reset(new Graph(prog)); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 1; + graph = pass->Apply(std::move(graph)); + ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); + ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); + + try { + graph = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos); + + pass = PassRegistry::Instance().Get("test_pass"); + pass->SetNotOwned("test_pass_attr", &val); + graph.reset(new Graph(prog)); + BuildCircleGraph(graph.get()); + graph->Set("test_graph_attr", new int); + graph->Get("test_graph_attr") = 2; + try { + auto tmp = pass->Apply(std::move(graph)); + } catch (paddle::platform::EnforceNotMet e) { + exception = std::string(e.what()); + } + ASSERT_TRUE(exception.find("shouldn't has cycle") != exception.npos); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(test_pass, paddle::framework::ir::TestPass) + .RequirePassAttr("test_pass_attr") + .RequireGraphAttr("test_graph_attr"); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 02c836bea194553bb9c4bc5677cc408dd302e9ce..b5f01a9a2b76472063658f1a051a2ee3c65559b7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -19,19 +19,80 @@ limitations under the License. */ #include #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" +#include "paddle/fluid/framework/details/ssa_graph_checker.h" +#include "paddle/fluid/framework/details/ssa_graph_printer.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace framework { +std::unique_ptr ApplyParallelExecutorPass( + const ProgramDesc &main_program, const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶m_names, + const std::vector &local_scopes, const bool use_cuda, +#ifdef PADDLE_WITH_CUDA + const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) { +#else + const BuildStrategy &strategy) { +#endif + // Convert the program to graph. + std::unique_ptr graph(new ir::Graph(main_program)); + + // Apply a graph viz pass to record a graph. + if (!strategy.debug_graphviz_path_.empty()) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + const std::string graph_path = string::Sprintf( + "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph"); + viz_pass->Set("graph_viz_path", new std::string(graph_path)); + graph = viz_pass->Apply(std::move(graph)); + } + + // Convert graph to run on multi-devices. + auto multi_device_pass = + ir::PassRegistry::Instance().Get("multi_device_pass"); + multi_device_pass->SetNotOwned>("places", + &places); + multi_device_pass->SetNotOwned("loss_var_name", + &loss_var_name); + multi_device_pass->SetNotOwned>( + "params", ¶m_names); + multi_device_pass->SetNotOwned>("local_scopes", + &local_scopes); + multi_device_pass->SetNotOwned("strategy", &strategy); + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + multi_device_pass->SetNotOwned("nccl_ctxs", nctx); +#endif + graph = multi_device_pass->Apply(std::move(graph)); + + // Apply a graph print pass to record a graph with device info. + if (!strategy.debug_graphviz_path_.empty()) { + auto multi_device_print_pass = + ir::PassRegistry::Instance().Get("multi_device_print_pass"); + multi_device_print_pass->SetNotOwned( + "debug_graphviz_path", &strategy.debug_graphviz_path_); + multi_device_print_pass->Set( + "graph_printer", new details::GraphvizSSAGraphPrinter); + graph = multi_device_print_pass->Apply(std::move(graph)); + } + + // Verify that the graph is correct for multi-device executor. + auto multi_device_check_pass = + ir::PassRegistry::Instance().Get("multi_device_check_pass"); + graph = multi_device_check_pass->Apply(std::move(graph)); + return graph; +} + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -119,21 +180,19 @@ ParallelExecutor::ParallelExecutor( var_infos.back().persistable_ = var->Persistable(); } - // Step 3. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp - details::SSAGraphBuilderFactory builder_factory( - member_->places_, loss_var_name, params, member_->local_scopes_, - build_strategy); - if (member_->use_cuda_) { +// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy, + member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA."); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - } - builder_ = builder_factory.Create(); - std::unique_ptr graph(new ir::Graph(main_program)); - graph = builder_->Apply(std::move(graph)); + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -146,11 +205,18 @@ void ParallelExecutor::BCastParamsToDevices( // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. - bool initializing = builder_.get() == nullptr ? true : false; - + bool initializing = member_->executor_ ? false : true; for (auto &var : vars) { - int var_dev_id = - builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + int var_dev_id = -1; + if (member_->executor_) { + auto &sharded_var_device = + member_->executor_->Graph().Get( + details::kShardedVarDevice); + if (sharded_var_device.find(var) != sharded_var_device.end()) { + var_dev_id = sharded_var_device.at(var); + } + } + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; @@ -286,3 +352,8 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle + +USE_PASS(graph_viz_pass); +USE_PASS(multi_device_pass); +USE_PASS(multi_device_check_pass); +USE_PASS(multi_device_print_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ffb9934a2d702b2bf6db7ad75a6bf9867e1e9901..d624956acde86cefc4ec1dec80df3738bcf1d8be 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -70,7 +70,6 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; - std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index 67d355d10d3c9e11b59c9ce9d208826523095459..27fe575cb6167a726ff92a8f3d2e47b6f536ba39 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -6,9 +6,11 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph tensorrt_subgraph_node_mark_pass.cc analyzer.cc helper.cc + model_store_pass.cc DEPS framework_proto proto_desc) cc_test(test_node SRCS node_tester.cc DEPS analysis) cc_test(test_dot SRCS dot_tester.cc DEPS analysis) +cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis) set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests) @@ -40,3 +42,4 @@ inference_analysis_test(test_tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass_ inference_analysis_test(test_pass_manager SRCS pass_manager_tester.cc) inference_analysis_test(test_tensorrt_subgraph_node_mark_pass SRCS tensorrt_subgraph_node_mark_pass_tester.cc) inference_analysis_test(test_analyzer SRCS analyzer_tester.cc) +inference_analysis_test(test_model_store_pass SRCS model_store_pass_tester.cc) diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index b3a1075e5adf4a24bf32017574c061f36c46ba8c..98bdfcc00b9f0e8f40dfc92e4021b2bd6fb19313 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/model_store_pass.h" #include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" #include "paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h" @@ -29,6 +30,9 @@ DEFINE_bool(inference_analysis_enable_tensorrt_subgraph_engine, false, DEFINE_string(inference_analysis_graphviz_log_root, "./", "Graphviz debuger for data flow graphs."); +DEFINE_string(inference_analysis_output_storage_path, "", + "optimized model output path"); + namespace inference { namespace analysis { @@ -47,6 +51,9 @@ class DfgPassManagerImpl final : public DfgPassManager { AddPass("tensorrt-subgraph", new TensorRTSubGraphPass(trt_teller)); } AddPass("data-flow-graph-to-fluid", new DataFlowGraphToFluidPass); + if (!FLAGS_inference_analysis_output_storage_path.empty()) { + AddPass("model-store-pass", new ModelStorePass); + } } std::string repr() const override { return "dfg-pass-manager"; } diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0132bf5b9c6552391aaa19542669487f42b685a7..c82fdfff86c91b4e07e3c1b80987d3d8d796ad23 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -16,28 +16,23 @@ limitations under the License. */ /* * This file contains Analyzer, an class that exposed as a library that analyze - * and optimize - * Fluid ProgramDesc for inference. Similar to LLVM, it has multiple flags to - * control whether - * an process is applied on the program. + * and optimize Fluid ProgramDesc for inference. Similar to LLVM, it has + * multiple flags to + * control whether an process is applied on the program. * * The processes are called Passes in analysis, the Passes are placed in a - * pipeline, the first - * Pass is the FluidToDataFlowGraphPass which transforms a Fluid ProgramDesc to - * a data flow - * graph, the last Pass is DataFlowGraphToFluidPass which transforms a data flow - * graph to a - * Fluid ProgramDesc. The passes in the middle of the pipeline can be any Passes - * which take a - * node or data flow graph as input. + * pipeline, the first Pass is the FluidToDataFlowGraphPass which transforms a + * Fluid ProgramDesc to + * a data flow graph, the last Pass is DataFlowGraphToFluidPass which transforms + * a data flow graph to a Fluid ProgramDesc. The passes in the middle of the + * pipeline can be any Passes + * which take a node or data flow graph as input. * * The Analyzer can be used in two methods, the first is a executable file which - * can be used to - * pre-process the inference model and can be controlled by passing difference - * command flags; + * can be used to pre-process the inference model and can be controlled by + * passing difference command flags; * the other way is to compose inside the inference API as a runtime pre-process - * phase in the - * inference service. + * phase in the inference service. */ #include @@ -50,6 +45,7 @@ namespace paddle { // flag if not available. DECLARE_bool(inference_analysis_enable_tensorrt_subgraph_engine); DECLARE_string(inference_analysis_graphviz_log_root); +DECLARE_string(inference_analysis_output_storage_path); namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/analyzer_main.cc b/paddle/fluid/inference/analysis/analyzer_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..5e1fe3eb797cdced56a61aa2db0c3d18601824f8 --- /dev/null +++ b/paddle/fluid/inference/analysis/analyzer_main.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file implements analysizer -- an executation help to analyze and + * optimize trained model. + */ +#include "paddle/fluid/inference/analysis/analyzer.h" +#include +#include + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + using paddle::inference::analysis::Analyzer; + using paddle::inference::analysis::Argument; + + Argument argument; + Analyzer analyzer; + analyzer.Run(&argument); + + return 0; +} diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 25a440e7e71fddb38cc515f99d15231675a8172e..24bfb3993cf569561980006b6627b56327dd0f67 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -20,14 +20,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, analysis_without_tensorrt) { +TEST(Analyzer, analysis_without_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = false; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } -TEST_F(DFG_Tester, analysis_with_tensorrt) { +TEST(Analyzer, analysis_with_tensorrt) { FLAGS_inference_analysis_enable_tensorrt_subgraph_engine = true; + Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); Analyzer analyser; analyser.Run(&argument); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 6d316f20bff7a68754b0afec6463bd5d7579227f..9e1c2e45865a56efb60d4ec632ff3c52e23fedde 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -36,6 +36,16 @@ namespace analysis { * All the fields should be registered here for clearness. */ struct Argument { + Argument() = default; + explicit Argument(const std::string& fluid_model_dir) + : fluid_model_dir(new std::string(fluid_model_dir)) {} + // The directory of the trained model. + std::unique_ptr fluid_model_dir; + // The path of `__model__` and `param`, this is used when the file name of + // model and param is changed. + std::unique_ptr fluid_model_program_path; + std::unique_ptr fluid_model_param_path; + // The graph that process by the Passes or PassManagers. std::unique_ptr main_dfg; @@ -44,6 +54,9 @@ struct Argument { // The processed program desc. std::unique_ptr transformed_program_desc; + + // The output storage path of ModelStorePass. + std::unique_ptr model_output_store_path; }; #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 1c60d5de21538043962cc58a6f508aea635fe8c4..bc1875f4d851c5d28d290357d94528fe3303f631 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -36,6 +36,8 @@ namespace analysis { /* * DataFlowGraph - A container of Value and Function Nodes. + * + * This is the base graph for any other type of graphs, such as SSA or CFG. */ struct DataFlowGraph { NodeMap nodes; diff --git a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc index 7912f8d7f17ae3c79e8f73f36b7095fd52c9ac86..a881262665f156812da9e1576aa29b05fc398499 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc @@ -20,7 +20,7 @@ namespace inference { namespace analysis { TEST(DataFlowGraph, BFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); @@ -44,7 +44,7 @@ TEST(DataFlowGraph, BFS) { } TEST(DataFlowGraph, DFS) { - auto desc = LoadProgramDesc(); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); dfg.Build(); GraphTraits trait(&dfg); diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc index d8fc5e580a98f76233f01fdc4d7987311f78ee45..4ef381db295b986b91173a728b6d98640f6f4f51 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass_tester.cc @@ -26,21 +26,21 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Test) { - DataFlowGraph graph; +TEST(DataFlowGraph, Test) { + Argument argument(FLAGS_inference_model_dir); FluidToDataFlowGraphPass pass0; DataFlowGraphToFluidPass pass1; ASSERT_TRUE(pass0.Initialize(&argument)); ASSERT_TRUE(pass1.Initialize(&argument)); - pass0.Run(&graph); - pass1.Run(&graph); + pass0.Run(argument.main_dfg.get()); + pass1.Run(argument.main_dfg.get()); pass0.Finalize(); pass1.Finalize(); - LOG(INFO) << graph.nodes.size(); + LOG(INFO) << argument.main_dfg->nodes.size(); } }; // namespace analysis diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc index 65842b1e850953e77e3d4d28416609be271af9f1..928be7917047382d9b86294f6039b26b0ebf6f49 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass_tester.cc @@ -23,12 +23,18 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, dfg_graphviz_draw_pass_tester) { - auto dfg = ProgramDescToDFG(*argument.origin_program_desc); +TEST(DFG_GraphvizDrawPass, dfg_graphviz_draw_pass_tester) { + Argument argument(FLAGS_inference_model_dir); + FluidToDataFlowGraphPass pass0; + ASSERT_TRUE(pass0.Initialize(&argument)); + pass0.Run(argument.main_dfg.get()); + + // auto dfg = ProgramDescToDFG(*argument.origin_program_desc); + DFG_GraphvizDrawPass::Config config("./", "test"); DFG_GraphvizDrawPass pass(config); pass.Initialize(&argument); - pass.Run(&dfg); + pass.Run(argument.main_dfg.get()); // test content std::ifstream file("./0-graph_test.dot"); diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 496921db9eabce1b1e40c7cb13089446ca93321c..511631d3e067f14bc1230d9e4b4d92dbe604e1d4 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include @@ -25,8 +26,20 @@ namespace analysis { bool FluidToDataFlowGraphPass::Initialize(Argument *argument) { ANALYSIS_ARGUMENT_CHECK_FIELD(argument); - ANALYSIS_ARGUMENT_CHECK_FIELD(argument->origin_program_desc); - PADDLE_ENFORCE(argument); + if (argument->origin_program_desc) { + LOG(WARNING) << "argument's origin_program_desc is already set, might " + "duplicate called"; + } + if (!argument->fluid_model_program_path) { + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_dir); + argument->fluid_model_program_path.reset( + new std::string(*argument->fluid_model_dir + "/__model__")); + } + ANALYSIS_ARGUMENT_CHECK_FIELD(argument->fluid_model_program_path); + auto program = LoadProgramDesc(*argument->fluid_model_program_path); + argument->origin_program_desc.reset( + new framework::proto::ProgramDesc(program)); + if (!argument->main_dfg) { argument->main_dfg.reset(new DataFlowGraph); } @@ -40,6 +53,8 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { PADDLE_ENFORCE(graph); PADDLE_ENFORCE(desc_); // insert vars + // The `var2id` keeps a map from a variable's name to its Node-id, the Node-id + // will keep updating to its latest alias during the graph-building. std::unordered_map var2id; auto &main_block = desc_->blocks(framework::kRootBlockIndex); for (int i = 0; i < main_block.vars_size(); i++) { @@ -51,6 +66,15 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { var2id[var.name()] = v->id(); } + // The variables in a SSA can only write once, so if a variable is written + // multiple times(quite common in our ProgramDesc design), multiple alias + // Nodes of this variable will be created, and each will just write once. + + // An set that keep all the names of the variables(the original, not alias) + // that have been written(as outputs). Once an Op's output variable hit the + // set, it should create a new alias and update the global alias for this + // variable. And that make a Data Flow Graph a SSA. + std::unordered_set unique_written_vars; for (int i = 0; i < main_block.ops_size(); i++) { const auto &op = main_block.ops(i); auto *o = graph->nodes.Create(Node::Type::kFunction); @@ -62,33 +86,33 @@ void FluidToDataFlowGraphPass::Run(DataFlowGraph *graph) { o->SetPbMsg(op.SerializeAsString()); // set inputs and outputs - std::unordered_set inlinks; for (int j = 0; j < op.inputs_size(); j++) { auto &in_var = op.inputs(j); for (int k = 0; k < in_var.arguments_size(); k++) { auto *in = graph->nodes.GetMutable(var2id.at(in_var.arguments(k))); in->outlinks.push_back(o); o->inlinks.push_back(in); - inlinks.insert(in); } } for (int j = 0; j < op.outputs_size(); j++) { auto &out_var = op.outputs(j); for (int k = 0; k < out_var.arguments_size(); k++) { auto *out = graph->nodes.GetMutable(var2id[out_var.arguments(k)]); - if (inlinks.count(out)) { + if (unique_written_vars.count(out)) { // Loop found, for example, a = op(a), use SSA, change to a1 = op(a). auto *out_alias = graph->nodes.Create(Node::Type::kValue); out_alias->SetName(out->name()); out_alias->SetPbDesc(out->pb_desc()); out_alias->SetPbMsg(out->pb_msg()); - var2id[out_alias->name()] = out_alias->id(); // update a -> a0 + var2id[out_alias->name()] = + out_alias->id(); // update variable's alias Node LOG(INFO) << "loop found in graph, create SSA alias node [" << out_alias->repr() << "] for [" << out->repr() << "]"; out = out_alias; } out->inlinks.push_back(o); o->outlinks.push_back(out); + unique_written_vars.insert(out); } } } diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h index da8463b63bd0bb1633bfcb9d7d41a884ddd632c7..fb948bf2242abcbc1e841fd3b8457e63358782c5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h @@ -30,7 +30,7 @@ namespace inference { namespace analysis { /* - * Transform a FluidDesc to a data flow graph. + * Transform a FluidDesc to a SSA. */ class FluidToDataFlowGraphPass final : public DataFlowGraphPass { public: diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc index dadb84059d21adab44159a6145b345460663cb96..d218dcd05015aa4636c16569de4addf4936c8cd5 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass_tester.cc @@ -21,8 +21,9 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, Init) { +TEST(FluidToDataFlowGraphPass, Test) { FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); pass.Initialize(&argument); pass.Run(argument.main_dfg.get()); // Analysis is sensitive to ProgramDesc, careful to change the original model. diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index f1064cd20f28092d80d3fd23a862da080b6cc2f3..a0f912b251d5ea29594a7f601d5b2bce91201790 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include #include @@ -136,6 +137,20 @@ static void ExecShellCommand(const std::string &cmd, std::string *message) { } } +static framework::proto::ProgramDesc LoadProgramDesc( + const std::string &model_path) { + std::ifstream fin(model_path, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(fin.is_open(), "Cannot open file %s", model_path); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + framework::proto::ProgramDesc program_desc; + program_desc.ParseFromString(buffer); + return program_desc; +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.cc b/paddle/fluid/inference/analysis/model_store_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..db7be3c0cde12c90ca698c13d4f3564d8b66ee40 --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/argument.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void ModelStorePass::Run(DataFlowGraph *x) { + if (!argument_->fluid_model_param_path) { + PADDLE_ENFORCE_NOT_NULL(argument_->fluid_model_dir); + argument_->fluid_model_param_path.reset( + new std::string(*argument_->fluid_model_dir + "param")); + } + PADDLE_ENFORCE_NOT_NULL(argument_->model_output_store_path); + // Directly copy param file to destination. + std::stringstream ss; + // NOTE these commands only works on linux. + ss << "mkdir -p " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + ss.str(""); + + ss << "cp " << *argument_->fluid_model_dir << "/*" + << " " << *argument_->model_output_store_path; + LOG(INFO) << "run command: " << ss.str(); + PADDLE_ENFORCE_EQ(system(ss.str().c_str()), 0); + + // Store program + PADDLE_ENFORCE_NOT_NULL(argument_->transformed_program_desc, + "program desc is not transformed, should call " + "DataFlowGraphToFluidPass first."); + const std::string program_output_path = + *argument_->model_output_store_path + "/__model__"; + std::ofstream file(program_output_path, std::ios::binary); + PADDLE_ENFORCE(file.is_open(), "failed to open %s to write.", + program_output_path); + const std::string serialized_message = + argument_->transformed_program_desc->SerializeAsString(); + file.write(serialized_message.c_str(), serialized_message.size()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass.h b/paddle/fluid/inference/analysis/model_store_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..713e8783eac3e9294dd22622e42deb50fd432082 --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* + * This file defines ModelStorePass, which store the runtime DFG to a Paddle + * model in the disk, and that model can be reloaded for prediction. + */ + +#include "paddle/fluid/inference/analysis/pass.h" + +namespace paddle { +namespace inference { +namespace analysis { + +class ModelStorePass : public DataFlowGraphPass { + public: + bool Initialize(Argument* argument) override { + if (!argument) { + LOG(ERROR) << "invalid argument"; + return false; + } + argument_ = argument; + return true; + } + + void Run(DataFlowGraph* x) override; + + std::string repr() const override { return "DFG-store-pass"; } + std::string description() const override { + return R"DD(This file defines ModelStorePass, which store the runtime DFG to a Paddle + model in the disk, and that model can be reloaded for prediction again.)DD"; + } + + private: + Argument* argument_{nullptr}; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/model_store_pass_tester.cc b/paddle/fluid/inference/analysis/model_store_pass_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..5f3526dd504e77e58d79b4f675db86a22fd0f26b --- /dev/null +++ b/paddle/fluid/inference/analysis/model_store_pass_tester.cc @@ -0,0 +1,43 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/analysis/model_store_pass.h" + +#include +#include +#include "paddle/fluid/inference/analysis/analyzer.h" + +namespace paddle { +namespace inference { +namespace analysis { + +DEFINE_string(inference_model_dir, "", "Model path"); + +TEST(DFG_StorePass, test) { + Analyzer analyzer; + Argument argument(FLAGS_inference_model_dir); + argument.model_output_store_path.reset( + new std::string("./_dfg_store_pass_tmp")); + // disable storage in alalyzer + FLAGS_inference_analysis_output_storage_path = ""; + analyzer.Run(&argument); + + ModelStorePass pass; + pass.Initialize(&argument); + pass.Run(argument.main_dfg.get()); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/pass.h b/paddle/fluid/inference/analysis/pass.h index 6b4dbb3bb5ddd9f15f26758beef1d1b5bbf49142..6806f9ff7dada2c1e2328e1ffbfd225afefcf474 100644 --- a/paddle/fluid/inference/analysis/pass.h +++ b/paddle/fluid/inference/analysis/pass.h @@ -50,6 +50,7 @@ class Pass { // Create a debugger Pass that draw the DFG by graphviz toolkit. virtual Pass *CreateGraphvizDebugerPass() const { return nullptr; } + virtual void Run() { LOG(FATAL) << "not valid"; } // Run on a single Node. virtual void Run(Node *x) { LOG(FATAL) << "not valid"; } // Run on a single Function. diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index dac1c509d728114bd24a2ea1150c407646026fd4..13423e4837e12a96e7a5dfc9ca3f59bf8b14746a 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -56,7 +56,7 @@ class TestNodePass final : public NodePass { std::string description() const override { return "some doc"; } }; -TEST_F(DFG_Tester, DFG_pass_manager) { +TEST(PassManager, DFG_pass_manager) { TestDfgPassManager manager; DFG_GraphvizDrawPass::Config config("./", "dfg.dot"); @@ -64,12 +64,15 @@ TEST_F(DFG_Tester, DFG_pass_manager) { manager.Register("graphviz", new DFG_GraphvizDrawPass(config)); manager.Register("dfg-to-fluid", new DataFlowGraphToFluidPass); + Argument argument(FLAGS_inference_model_dir); + ASSERT_TRUE(&argument); ASSERT_TRUE(manager.Initialize(&argument)); manager.RunAll(); } -TEST_F(DFG_Tester, Node_pass_manager) { +TEST(PassManager, Node_pass_manager) { + Argument argument(FLAGS_inference_model_dir); // Pre-process: initialize the DFG with the ProgramDesc first. FluidToDataFlowGraphPass pass0; pass0.Initialize(&argument); diff --git a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc index 67dd4da54b95add703428e1fded61065f60353e8..39cc433b40fad17f4f12359d4e907a250a88bd63 100644 --- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc +++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc @@ -31,8 +31,8 @@ SubGraphSplitter::NodeInsideSubgraphTeller teller = [](const Node* node) { return false; }; -TEST_F(DFG_Tester, Split) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Split) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); LOG(INFO) << "spliter\n" << dfg.DotString(); @@ -63,8 +63,8 @@ TEST_F(DFG_Tester, Split) { ASSERT_EQ(subgraphs.back().size(), 6UL); } -TEST_F(DFG_Tester, Fuse) { - auto desc = LoadProgramDesc(); +TEST(SubGraphSplitter, Fuse) { + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); auto dfg = ProgramDescToDFG(desc); size_t count0 = dfg.nodes.size(); diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc index a6c15e848b99ca318f4583e3d4b88345fe8e5ebc..c1d932878e559180af987594535959afdf475587 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass_tester.cc @@ -22,11 +22,11 @@ namespace paddle { namespace inference { namespace analysis { -TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { +TEST(TensorRTSubgraphNodeMarkPass, test) { // init FluidToDataFlowGraphPass pass; + Argument argument(FLAGS_inference_model_dir); ASSERT_TRUE(pass.Initialize(&argument)); - argument.main_dfg.reset(new DataFlowGraph); pass.Run(argument.main_dfg.get()); TensorRTSubgraphNodeMarkPass::teller_t teller = [](const Node* node) { @@ -41,7 +41,7 @@ TEST_F(DFG_Tester, tensorrt_subgraph_node_mark_pass) { for (auto& node : argument.main_dfg->nodes.nodes()) { counter += node->attr(ATTR_supported_by_tensorrt).Bool(); } - + ASSERT_EQ(counter, 2); LOG(INFO) << counter << " nodes marked"; } diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc index 1d749d3fa3f39b351ccee6ebeb82467f7220a0b6..67a5af83d89b771536ea11be51b35244ff5c09d6 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc @@ -25,7 +25,7 @@ namespace analysis { DEFINE_string(dot_dir, "./", ""); -TEST_F(DFG_Tester, tensorrt_single_pass) { +TEST(TensorRTSubGraphPass, main) { std::unordered_set teller_set( {"elementwise_add", "mul", "sigmoid"}); SubGraphSplitter::NodeInsideSubgraphTeller teller = [&](const Node* node) { @@ -35,7 +35,8 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { return false; }; - LOG(INFO) << "init"; + Argument argument(FLAGS_inference_model_dir); + DFG_GraphvizDrawPass::Config config{FLAGS_dot_dir, "origin"}; DFG_GraphvizDrawPass::Config config1{FLAGS_dot_dir, "fusion"}; @@ -44,13 +45,11 @@ TEST_F(DFG_Tester, tensorrt_single_pass) { FluidToDataFlowGraphPass pass0; TensorRTSubGraphPass trt_pass(std::move(teller)); - LOG(INFO) << "Initialize"; dfg_pass.Initialize(&argument); dfg_pass1.Initialize(&argument); pass0.Initialize(&argument); trt_pass.Initialize(&argument); - LOG(INFO) << "Run"; argument.main_dfg.reset(new DataFlowGraph); pass0.Run(argument.main_dfg.get()); dfg_pass.Run(argument.main_dfg.get()); diff --git a/paddle/fluid/inference/analysis/ut_helper.h b/paddle/fluid/inference/analysis/ut_helper.h index ce1191a567a4198f003520c40bf02487c48c56eb..1073a6f686eaeeaaae2d93ab044149b7df518085 100644 --- a/paddle/fluid/inference/analysis/ut_helper.h +++ b/paddle/fluid/inference/analysis/ut_helper.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" -#include "paddle/fluid/inference/analysis/ut_helper.h" +#include "paddle/fluid/inference/analysis/helper.h" namespace paddle { namespace inference { @@ -32,27 +32,12 @@ namespace analysis { DEFINE_string(inference_model_dir, "", "inference test model dir"); -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string& model_dir = FLAGS_inference_model_dir) { - std::string msg; - std::string net_file = FLAGS_inference_model_dir + "/__model__"; - std::ifstream fin(net_file, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", net_file); - fin.seekg(0, std::ios::end); - msg.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(msg.at(0)), msg.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(msg); - return program_desc; -} - static DataFlowGraph ProgramDescToDFG( const framework::proto::ProgramDesc& desc) { DataFlowGraph graph; FluidToDataFlowGraphPass pass; Argument argument; + argument.fluid_model_dir.reset(new std::string(FLAGS_inference_model_dir)); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); pass.Initialize(&argument); pass.Run(&graph); @@ -63,7 +48,7 @@ static DataFlowGraph ProgramDescToDFG( class DFG_Tester : public ::testing::Test { protected: void SetUp() override { - auto desc = LoadProgramDesc(FLAGS_inference_model_dir); + auto desc = LoadProgramDesc(FLAGS_inference_model_dir + "/__model__"); argument.origin_program_desc.reset(new framework::proto::ProgramDesc(desc)); } diff --git a/paddle/fluid/inference/api/api_anakin_engine_tester.cc b/paddle/fluid/inference/api/api_anakin_engine_tester.cc index d6d631bfbad4278fe99e4553a410a9d9162dcc7b..7554fe4989b3f98e5af13dfb51b549083e4cd777 100644 --- a/paddle/fluid/inference/api/api_anakin_engine_tester.cc +++ b/paddle/fluid/inference/api/api_anakin_engine_tester.cc @@ -37,19 +37,21 @@ TEST(inference, anakin) { float data[1 * 3 * 224 * 224] = {1.0f}; - PaddleTensor tensor{.name = "input_0", - .shape = std::vector({1, 3, 224, 224}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor; + tensor.name = "input_0"; + tensor.shape = std::vector({1, 3, 224, 224}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::FLOAT32; // For simplicity, we set all the slots with the same data. std::vector paddle_tensor_feeds; paddle_tensor_feeds.emplace_back(std::move(tensor)); - PaddleTensor tensor_out{.name = "prob_out", - .shape = std::vector({1000, 1}), - .data = PaddleBuf(), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor tensor_out; + tensor_out.name = "prob_out"; + tensor_out.shape = std::vector({1000, 1}); + tensor_out.data = PaddleBuf(); + tensor_out.dtype = PaddleDType::FLOAT32; std::vector outputs; outputs.emplace_back(std::move(tensor_out)); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 58fd7c6f8b05a846bd4a82068f09f5d9ef5a6516..08d7af6d3af7054061b15b904c69b2862c629562 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -183,6 +183,13 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), inputs[i].data.length()); + // TODO(Superjomn) Low performance, need optimization for heavy LoD copy. + framework::LoD lod; + for (auto &level : inputs[i].lod) { + lod.emplace_back(level); + } + input.set_lod(lod); + feeds->push_back(input); } return true; @@ -248,6 +255,10 @@ bool NativePaddlePredictor::GetFetch( buffer.Resize(sizeof(float) * data.size()); } std::memcpy(buffer.data(), data.data(), buffer.length()); + // copy LoD + for (const auto &level : fetchs[i].lod()) { + outputs->at(i).lod.emplace_back(level); + } outputs->at(i).dtype = PaddleDType::FLOAT32; // TODO(panyx0718): support other types? fill tensor name? avoid a copy. } diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index c0891e9c281961fa03d278a0f5c676f92672c419..45b5a7638b7dc6a54bbd905766fd5c284cb6aea1 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -90,6 +90,18 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { void OptimizeInferenceProgram() { // Analyze inference_program Argument argument; + if (!config_.model_dir.empty()) { + argument.fluid_model_dir.reset(new std::string(config_.model_dir)); + } else { + PADDLE_ENFORCE( + !config_.param_file.empty(), + "Either model_dir or (param_file, prog_file) should be set."); + PADDLE_ENFORCE(!config_.prog_file.empty()); + argument.fluid_model_program_path.reset( + new std::string(config_.prog_file)); + argument.fluid_model_param_path.reset( + new std::string(config_.param_file)); + } argument.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); Singleton::Global().Run(&argument); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 62d98a796708612e7d4ff8abfd85125978ce22c7..fcbf9b89d608e7961e3ef81ac1c70e083dae1cc0 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -49,11 +49,10 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { std::vector data(20); for (int i = 0; i < 20; i++) data[i] = i; - PaddleTensor tensor{ - .name = "", - .shape = std::vector({10, 1}), - .data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({10, 1}); + tensor.data = PaddleBuf(data.data(), data.size() * sizeof(int64_t)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 5f96fecf93f7a6c42bc6b9fe4e0d985c626388d7..03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -47,10 +47,10 @@ void Main(bool use_gpu) { //# 2. Prepare input. int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; // For simplicity, we set all the slots with the same data. std::vector slots(4, tensor); @@ -94,10 +94,11 @@ void MainThreads(int num_threads, bool use_gpu) { for (int batch_id = 0; batch_id < num_batches; ++batch_id) { // 2. Dummy Input Data int64_t data[4] = {1, 2, 3, 4}; - PaddleTensor tensor{.name = "", - .shape = std::vector({4, 1}), - .data = PaddleBuf(data, sizeof(data)), - .dtype = PaddleDType::INT64}; + PaddleTensor tensor; + tensor.shape = std::vector({4, 1}); + tensor.data = PaddleBuf(data, sizeof(data)); + tensor.dtype = PaddleDType::INT64; + std::vector inputs(4, tensor); std::vector outputs; // 3. Run diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index 0a2a2b713ab21a3124d8a85ba469f64278623ec4..ddfe05a502b95abf52502853af861e5909148b9a 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -123,11 +123,11 @@ void Main(bool use_gpu) { file.close(); // Inference. - PaddleTensor input{ - .name = "xx", - .shape = record.shape, - .data = PaddleBuf(record.data.data(), record.data.size() * sizeof(float)), - .dtype = PaddleDType::FLOAT32}; + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; VLOG(3) << "run executor"; std::vector output; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 2f8b4f8596946988a728b5cf82de251bfda778a9..3342ee3c25446232e15b377229cdc303c0a0b40d 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -67,9 +67,9 @@ struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. std::vector shape; - // TODO(Superjomn) for LoD support, add a vector> field if needed. PaddleBuf data; // blob of data. PaddleDType dtype; + std::vector> lod; // lod data }; enum class PaddleEngineKind { diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 438b44b42aaf4c7e3ff05a5f7c52bbfd850e92c7..e14b148cc00f425e90b0b2256ab3462753a34f47 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -19,12 +19,17 @@ limitations under the License. */ #include // NOLINT #include +#include "gflags/gflags.h" + #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/listen_and_serv_op.h" #include "paddle/fluid/platform/profiler.h" +DEFINE_int32(listen_and_serv_profile_period, 0, + "the period of listen_and_serv to do profile"); + namespace paddle { namespace operators { @@ -122,7 +127,18 @@ void ListenAndServOp::RunSyncLoop( std::shared_ptr(nullptr)); rpc_service_->ResetBarrierCounter(); + + int32_t profile_step = 0; while (true) { + PADDLE_ENFORCE_LE(profile_step, FLAGS_listen_and_serv_profile_period, + "profile_step should not be larger then " + "FLAGS_listen_and_serv_profile_period"); + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == 0) { + auto pf_state = paddle::platform::ProfilerState::kCPU; + paddle::platform::EnableProfiler(pf_state); + } + } // Get from multiple trainers, we don't care about the order in which // the gradients arrives, just add suffix 0~n and merge the gradient. rpc_service_->SetCond(distributed::kRequestSend); @@ -164,6 +180,15 @@ void ListenAndServOp::RunSyncLoop( // reset received sparse vars to avoid reuse it in the next mini-batch dynamic_cast(request_send_handler_.get()) ->ResetSparseVarRecorder(); + if (FLAGS_listen_and_serv_profile_period > 0) { + if (profile_step == FLAGS_listen_and_serv_profile_period) { + paddle::platform::DisableProfiler( + paddle::platform::EventSortingKey::kTotal, "/dev/null"); + profile_step = 0; + } else { + profile_step++; + } + } } // while(true) } diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index bb55ce21b0599bcff4db138a46c9c700f6e52422..1472edbbf47e3e4d6b22c65349713904b13647d2 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" namespace paddle { namespace operators { @@ -35,61 +36,18 @@ class Im2ColFunctordims().size() == 5); - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - - int channels_col = im_channels * filter_height * filter_width; - - const T* im_data = im.data(); - T* col_data = col->data(); - // TODO(TJ): change me to template - // further optimaze: - // 1. padding != 1 - // 2. could also support stride_h != 1 if (stride[0] == 1 && stride[1] == 1 && dilation[0] == 1 && - dilation[1] == 1 && padding[0] == 0 && padding[1] == 0) { - int col_matrix_width = output_width * output_height; - size_t copy_size = sizeof(T) * output_width; - for (int oh = 0; oh < output_height; ++oh) { - const T* im_data_start = im_data + oh * im_width; - T* dst_data = col_data + oh * output_width; - for (int ic = 0; ic < im_channels; ++ic) { - const T* src_data = im_data_start + ic * im_height * im_width; - for (int kh = 0; kh < filter_height; ++kh) { - for (int kw = 0; kw < filter_width; ++kw) { - std::memcpy(dst_data, src_data + kw, copy_size); - dst_data = dst_data + col_matrix_width; - } - src_data = src_data + im_width; - } - } - } - return; - } - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? static_cast(0) - : im_data[im_idx]; - } + dilation[1] == 1) { + if (padding[0] == 0 && padding[1] == 0) { + im2col_sh1sw1dh1dw1ph0pw0(im, col); + return; + } else if (padding[0] == 1 && padding[1] == 1) { + im2col_sh1sw1dh1dw1ph1pw1(im, col); + return; } + // TODO(TJ): complete padding >=2 } + im2col_common(im, dilation, stride, padding, col); } }; diff --git a/paddle/fluid/operators/math/im2col_cfo_cpu.h b/paddle/fluid/operators/math/im2col_cfo_cpu.h new file mode 100644 index 0000000000000000000000000000000000000000..0d32bc5bd0d7f25479370959cabeb9b9c9e7e2d6 --- /dev/null +++ b/paddle/fluid/operators/math/im2col_cfo_cpu.h @@ -0,0 +1,252 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/** + * The most common im2col algorithm. + * Support dilation, stride and padding. + */ +template +inline void im2col_common(const framework::Tensor& im, + const std::vector& dilation, + const std::vector& stride, + const std::vector& padding, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + int channels_col = im_channels * filter_height * filter_width; + + const T* im_data = im.data(); + T* col_data = col->data(); + for (int c = 0; c < channels_col; ++c) { + int w_offset = c % filter_width; + int h_offset = (c / filter_width) % filter_height; + int c_im = c / (filter_width * filter_height); + for (int h = 0; h < output_height; ++h) { + int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; + for (int w = 0; w < output_width; ++w) { + int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; + int col_idx = (c * output_height + h) * output_width + w; + int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; + col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || + im_col_idx < 0 || im_col_idx >= im_width) + ? static_cast(0) + : im_data[im_idx]; + } + } + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 0 + */ +template +inline void im2col_sh1sw1dh1dw1ph0pw0(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + const T* im_data = im.data(); + T* col_data = col->data(); + int col_matrix_width = output_width * output_height; + int im_size = im_height * im_width; + size_t copy_size = sizeof(T) * output_width; + const T* im_data_oh = im_data; + T* dst_data_oh = col_data; + for (int oh = 0; oh < output_height; ++oh) { + const T* src_data_ic = im_data_oh; + T* dst_data = dst_data_oh; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = src_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (int kw = 0; kw < filter_width; ++kw) { + std::memcpy(dst_data, src_data + kw, copy_size); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + src_data_ic = src_data_ic + im_size; + } + im_data_oh = im_data_oh + im_width; + dst_data_oh = dst_data_oh + output_width; + } +} + +/** + * im2col algorithm with strides == 1, dilations == 1, paddings == 1 + * and filter_width == 1 have a special implementation + */ +template +inline void im2col_sh1sw1dh1dw1ph1pw1(const framework::Tensor& im, + framework::Tensor* col) { + int im_channels = im.dims()[0]; + int im_height = im.dims()[1]; + int im_width = im.dims()[2]; + int filter_height = col->dims()[1]; + int filter_width = col->dims()[2]; + int output_height = col->dims()[3]; + int output_width = col->dims()[4]; + + constexpr int plh = 1; + constexpr int prh = 1; + constexpr int plw = 1; + constexpr int prw = 1; + + const T* im_data = im.data(); + T* col_data = col->data(); + int im_size = im_height * im_width; + int col_matrix_width = output_width * output_height; + int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow + int col_block_ic = filter_height * col_block_fh; // fh*fw*oh*ow + + // fill height padding + { + size_t copy_size = sizeof(T) * output_width; + T* col_start_l = col_data; + T* col_start_r = col_data + (filter_height - 1) * col_block_fh + + col_matrix_width - output_width; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_l = col_start_l; + T* dst_data_r = col_start_r; + for (int kw = 0; kw < filter_width; ++kw) { + std::memset(dst_data_l, 0, copy_size); + std::memset(dst_data_r, 0, copy_size); + dst_data_l = dst_data_l + col_matrix_width; + dst_data_r = dst_data_r + col_matrix_width; + } + col_start_l = col_start_l + col_block_ic; + col_start_r = col_start_r + col_block_ic; + } + } + + auto pad = static_cast(0); + if (filter_width == 1) { + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + T* dst_data = dst_data_kh; + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width - 1; + *dst_data = pad; + ++dst_data; + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + // fill core + size_t copy_size = sizeof(T) * (output_width - plw - prw); + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = + im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + col_matrix_width; + continue; + } + std::memcpy(dst_data + plw, src_data, copy_size); + dst_data = dst_data + col_matrix_width; + src_data = src_data + im_width; + } + } + } + return; + } + + // filter_width != 1 + // fill width padding + T* dst_data_ic = col_data; + for (int ic = 0; ic < im_channels; ++ic) { + T* dst_data_kh = dst_data_ic; + for (int kh = 0; kh < filter_height; ++kh) { + for (T* dst_data : + {dst_data_kh, dst_data_kh + (filter_width - prw) * col_matrix_width + + output_width - 1}) { + // TODO(TJ): from plh, saving repeated assignment + for (int oh = 0; oh < output_height; ++oh) { + *dst_data = pad; + dst_data = dst_data + output_width; + } + } + dst_data_kh = dst_data_kh + col_block_fh; + } + dst_data_ic = dst_data_ic + col_block_ic; + } + + // TODO(TJ): use array like: size_t copy_size[kw]={sizeof(T) * + // (output_width-1)} + // length of copy_size is equal kw. + for (int oh = 0; oh < output_height; ++oh) { + const T* im_data_start = im_data + (oh - plh > 0 ? oh - plh : 0) * im_width; + T* dst_data = col_data + oh * output_width; + for (int ic = 0; ic < im_channels; ++ic) { + const T* src_data = im_data_start + ic * im_size; + for (int kh = 0; kh < filter_height; ++kh) { + if ((oh < plh && kh < plh) || (oh > (output_height - prh - 1) && + kh > (filter_height - prh - 1))) { + dst_data = dst_data + filter_width * col_matrix_width; + continue; + } + // TODO(TJ): reuse plw-kw outside this for + // try to unify + for (int kw = 0; kw < plw; ++kw) { + std::memcpy(dst_data + (plw - kw), src_data, + sizeof(T) * (output_width - (plw - kw))); + dst_data = dst_data + col_matrix_width; + } + for (int kw = plw; kw < filter_width - prw; ++kw) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * output_width); + dst_data = dst_data + col_matrix_width; + } + int i = 1; + for (int kw = filter_width - prw; kw < filter_width; ++kw, ++i) { + std::memcpy(dst_data, src_data + (kw - plw), + sizeof(T) * (output_width - i)); + dst_data = dst_data + col_matrix_width; + } + src_data = src_data + im_width; + } + } + } +} + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index db61f68db3e492d98cfa43576fa1900bffc8674d..ae2c90b33a4298ada4fd01aa2a44ebdf10d036d4 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include +#include #include +#include "paddle/fluid/operators/math/im2col_cfo_cpu.h" template void testIm2col() { @@ -160,82 +162,111 @@ void testIm2col() { delete context; } -void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { - paddle::framework::Tensor input; - paddle::framework::Tensor output; - paddle::framework::Tensor ref_output; - std::vector padding({ph, pw}); - std::vector stride({1, 1}); // stride_y, stride_x - std::vector dilation({1, 1}); // dilation_y, dilation_x - int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; - int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; - float* input_ptr = - input.mutable_data({ic, ih, iw}, paddle::platform::CPUPlace()); - for (int i = 0; i < input.numel(); ++i) { - input_ptr[i] = static_cast(i + 1); - } - - paddle::platform::CPUPlace place; - paddle::platform::CPUDeviceContext context(place); - output.mutable_data({ic, fh, fw, output_height, output_width}, place); - ref_output.mutable_data({ic, fh, fw, output_height, output_width}, - place); - paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, - paddle::platform::CPUDeviceContext, float> - im2col; - im2col(context, input, dilation, stride, padding, &output); - auto ref_im2col = [&]( - const paddle::framework::Tensor& im, const std::vector& dilation, - const std::vector& stride, const std::vector& padding, - paddle::framework::Tensor* col) { - int im_channels = im.dims()[0]; - int im_height = im.dims()[1]; - int im_width = im.dims()[2]; - int filter_height = col->dims()[1]; - int filter_width = col->dims()[2]; - int output_height = col->dims()[3]; - int output_width = col->dims()[4]; - int channels_col = im_channels * filter_height * filter_width; - - const float* im_data = im.data(); - float* col_data = col->data(); - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int c_im = c / (filter_width * filter_height); - for (int h = 0; h < output_height; ++h) { - int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0]; - for (int w = 0; w < output_width; ++w) { - int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1]; - int col_idx = (c * output_height + h) * output_width + w; - int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx; - col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height || - im_col_idx < 0 || im_col_idx >= im_width) - ? 0.f - : im_data[im_idx]; - } - } - } - }; - - ref_im2col(input, dilation, stride, padding, &ref_output); - - float* out_cfo_ptr = output.data(); - float* out_ref_ptr = ref_output.data(); - for (int i = 0; i < output.numel(); ++i) { - EXPECT_EQ(out_cfo_ptr[i], out_ref_ptr[i]); - } -} - TEST(math, im2col) { testIm2col(); - testIm2colCPU(/*ic*/ 3, /*ih*/ 5, /*iw*/ 5, /*fh*/ 3, /*fw*/ 2, /*ph*/ 0, - /*pw*/ 0); - testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ 1, - /*pw*/ 1); #ifdef PADDLE_WITH_CUDA testIm2col(); #endif } + +#define PREPARE_IM2COL_CPU \ + paddle::platform::CPUPlace place; \ + paddle::platform::CPUDeviceContext context(place); \ + paddle::framework::Tensor input; \ + paddle::framework::Tensor out; \ + paddle::framework::Tensor ref; \ + std::vector padding({ph, pw}); \ + std::vector stride({1, 1}); \ + std::vector dilation({1, 1}); \ + float* input_ptr = input.mutable_data({ic, ih, iw}, place); \ + for (int i = 0; i < input.numel(); ++i) { \ + input_ptr[i] = static_cast(i + 1); \ + } \ + int output_height = (ih - fh + padding[0] * 2) / stride[0] + 1; \ + int output_width = (iw - fw + padding[1] * 2) / stride[1] + 1; \ + out.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + ref.mutable_data({ic, fh, fw, output_height, output_width}, place); \ + paddle::operators::math::Im2ColFunctor< \ + paddle::operators::math::ColFormat::kCFO, \ + paddle::platform::CPUDeviceContext, float> \ + im2col + +void testIm2colCPU(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + + im2col(context, input, dilation, stride, padding, &out); + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + + float* ref_data = ref.data(); + float* out_data = out.data(); + for (int i = 0; i < out.numel(); ++i) { + EXPECT_EQ(out_data[i], ref_data[i]); + } +} + +void benchIm2col(int ic, int ih, int iw, int fh, int fw, int ph, int pw) { + PREPARE_IM2COL_CPU; + constexpr int repeat = 100; + auto GetCurrentMs = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+3 * time.tv_sec + 1e-3 * time.tv_usec; + }; + auto t1 = GetCurrentMs(); + for (int i = 0; i < repeat; ++i) { + im2col(context, input, dilation, stride, padding, &out); + } + auto t2 = GetCurrentMs(); + + for (int i = 0; i < repeat; ++i) { + paddle::operators::math::im2col_common(input, dilation, stride, + padding, &ref); + } + auto t3 = GetCurrentMs(); + + LOG(INFO) << "before: " << (t3 - t2) / repeat + << ",after: " << (t2 - t1) / repeat + << ",boost: " << ((t3 - t2) / (t2 - t1) - 1) * 100 << "%"; +} + +TEST(math, im2col_cputest) { + // padding_h == padding_w + for (int p = 0; p < 4; ++p) { + // width == height + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 5, /*fh*/ 4, /*fw*/ 4, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 3, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 2, /*ph*/ p, + /*pw*/ p); + + // height != width + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 5, /*iw*/ 4, /*fh*/ 1, /*fw*/ 3, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 5, /*fh*/ 3, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + + // filter == 1 + testIm2colCPU(/*ic*/ 3, /*ih*/ 4, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + testIm2colCPU(/*ic*/ 3, /*ih*/ 3, /*iw*/ 4, /*fh*/ 1, /*fw*/ 1, /*ph*/ p, + /*pw*/ p); + } + + // padding_h != padding_w + testIm2colCPU(/*ic*/ 2, /*ih*/ 4, /*iw*/ 4, /*fh*/ 2, /*fw*/ 3, /*ph*/ 1, + /*pw*/ 2); + + // benchmark + for (int p : {0, 1}) { + for (int k : {1, 3, 5}) { + LOG(INFO) << "padding == " << p << ", filter == " << k; + benchIm2col(/*ic*/ 3, /*ih*/ 224, /*iw*/ 224, /*fh*/ k, /*fw*/ k, + /*ph*/ p, /*pw*/ p); + } + } +} diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index a9fd1869c9df5464db6fc87ac633cdba2d6dbe7f..a1dfe39c3a4f84f5e4aaa2306813a7decf0e49ea 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -127,12 +127,6 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); - AddAttr("inplace", - "(default: false) Change the source tensor's shape without " - "memory copy. When Attr(inplace) is set true, the output " - "tensor shares memory with Input(X), otherwise, a new output " - "tensor is created, and its data are copied from Input(x).") - .SetDefault(false); AddComment(R"DOC( Reshape Operator. @@ -233,16 +227,9 @@ class ReshapeKernel { "sequence_reshape op."); } - bool inplace = ctx.Attr("inplace"); + out->mutable_data(ctx.GetPlace(), in->type()); + framework::TensorCopySync(*in, ctx.GetPlace(), out); out->Resize(out_dims); - if (!inplace) { - out->mutable_data(ctx.GetPlace(), in->type()); - framework::TensorCopySync(*in, ctx.GetPlace(), out); - out->Resize(out_dims); - } else { - out->ShareDataWith(*in); - out->Resize(out_dims); - } } }; @@ -251,19 +238,11 @@ class ReshapeGradKernel { void operator()(const framework::ExecutionContext &ctx) const { auto *d_out = ctx.Input(framework::GradVarName("Out")); auto *d_x = ctx.Output(framework::GradVarName("X")); + auto in_dims = d_x->dims(); d_x->mutable_data(ctx.GetPlace(), d_out->type()); - bool inplace = ctx.Attr("inplace"); - - auto in_dims = d_x->dims(); - if (!inplace) { - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - ctx.device_context().Wait(); - d_x->Resize(in_dims); - } else { - d_x->ShareDataWith(*d_out); - d_x->Resize(in_dims); - } + framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x); + d_x->Resize(in_dims); } }; diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h index d263426e073d95ad6d584c7370baf596587a993d..c4af5a65fc5f81c1af7c1fdcca637ca37c940637 100644 --- a/paddle/fluid/operators/split_ids_op.h +++ b/paddle/fluid/operators/split_ids_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" @@ -67,10 +68,15 @@ class SplitIdsOpKernel : public framework::OpKernel { const auto &ids_rows = ids_selected_rows->rows(); auto outs = ctx.MultiOutput("Out"); const size_t shard_num = outs.size(); + for (auto &out : outs) { + out->mutable_rows()->clear(); + } // get rows for outputs - for (auto &id : ids_rows) { - size_t shard_id = static_cast(id) % shard_num; - outs[shard_id]->mutable_rows()->push_back(id); + std::unordered_map id_to_index; + for (size_t i = 0; i < ids_rows.size(); ++i) { + id_to_index[ids_rows[i]] = i; + size_t shard_id = static_cast(ids_rows[i]) % shard_num; + outs[shard_id]->mutable_rows()->push_back(ids_rows[i]); } int64_t row_width = ids_dims[1]; @@ -80,7 +86,8 @@ class SplitIdsOpKernel : public framework::OpKernel { {static_cast(out->rows().size()), row_width}); T *output = out->mutable_value()->mutable_data(ddim, place); for (int64_t i = 0; i < ddim[0]; ++i) { - memcpy(output + i * row_width, ids + out->rows()[i] * row_width, + memcpy(output + i * row_width, + ids + id_to_index[out->rows()[i]] * row_width, row_width * sizeof(T)); } } diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu index 4a47ba5ccad4de338844e60f6fcbd6b7c11e891b..ca5ca1caeb23f01c047feeccf9c276b2dcd1cb68 100644 --- a/paddle/fluid/platform/cuda_helper_test.cu +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -13,7 +13,6 @@ // limitations under the License. #include -#include #include #include @@ -25,13 +24,13 @@ using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; -#define CUDA_ATOMIC_KERNEL(op, T) \ - __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \ - i += blockDim.x * gridDim.x) { \ - paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \ - } \ +template +__global__ void AddKernel(const T* data_a, T* data_b, size_t num) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); } +} template struct AddFunctor { @@ -39,80 +38,116 @@ struct AddFunctor { }; template -struct SubFunctor { - T operator()(const T& a, const T& b) { return a - b; } -}; - -// NOTE(dzhwinter): the float16 add has small underflow/overflow -// so we use EXPECT_NEAR to check the result. -#define ARITHMETIC_KERNEL_LAUNCH(op, T) \ - void Test##T##op(size_t num) { \ - T *in1, *in2, *out; \ - T *d_in1, *d_in2; \ - size_t size = sizeof(T) * num; \ - cudaMalloc(reinterpret_cast(&d_in1), size); \ - cudaMalloc(reinterpret_cast(&d_in2), size); \ - in1 = reinterpret_cast(malloc(size)); \ - in2 = reinterpret_cast(malloc(size)); \ - out = reinterpret_cast(malloc(size)); \ - std::minstd_rand engine; \ - std::uniform_real_distribution dist(0.0, 1.0); \ - for (size_t i = 0; i < num; ++i) { \ - in1[i] = static_cast(dist(engine)); \ - in2[i] = static_cast(dist(engine)); \ - } \ - cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ - cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ - op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \ - cudaDeviceSynchronize(); \ - cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \ - cudaDeviceSynchronize(); \ - for (size_t i = 0; i < num; ++i) { \ - EXPECT_NEAR(static_cast(out[i]), \ - static_cast(op##Functor()(in1[i], in2[i])), \ - 0.001); \ - } \ - free(in1); \ - free(in2); \ - free(out); \ - cudaFree(d_in1); \ - cudaFree(d_in2); \ +void TestCase(size_t num) { + T *in1, *in2, *out; + T *d_in1, *d_in2; + size_t size = sizeof(T) * num; + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num; ++i) { + in1[i] = static_cast(dist(engine)); + in2[i] = static_cast(dist(engine)); } -CUDA_ATOMIC_KERNEL(Add, float); -CUDA_ATOMIC_KERNEL(Add, double); -CUDA_ATOMIC_KERNEL(Add, float16); - -ARITHMETIC_KERNEL_LAUNCH(Add, float); -ARITHMETIC_KERNEL_LAUNCH(Add, double); -ARITHMETIC_KERNEL_LAUNCH(Add, float16); - -namespace paddle { -namespace platform { -USE_CUDA_ATOMIC(Sub, int); -}; -}; -CUDA_ATOMIC_KERNEL(Sub, int); -ARITHMETIC_KERNEL_LAUNCH(Sub, int); + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(in1[i], in2[i])), 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} // cuda primitives TEST(CudaAtomic, Add) { - TestfloatAdd(static_cast(10)); - TestfloatAdd(static_cast(1024 * 1024)); - TestdoubleAdd(static_cast(10)); - TestdoubleAdd(static_cast(1024 * 1024)); -} + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); -TEST(CudaAtomic, Sub) { - TestintSub(static_cast(10)); - TestintSub(static_cast(1024 * 1024)); + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); } TEST(CudaAtomic, float16) { - using paddle::platform::float16; - Testfloat16Add(static_cast(1)); - Testfloat16Add(static_cast(2)); - Testfloat16Add(static_cast(3)); + TestCase(static_cast(1)); + TestCase(static_cast(2)); + TestCase(static_cast(3)); + + TestCase(static_cast(10)); + TestCase(static_cast(1024 * 1024)); +} + +// unalignment of uint8 +void TestUnalign(size_t num, const int shift_bit) { + PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2"); + float16 *in1, *in2, *out; + float16 *d_in1, *d_in2; + size_t size = sizeof(uint8_t) * (num + shift_bit); + size_t array_size = sizeof(float16) * (num / 2); + + cudaMalloc(reinterpret_cast(&d_in1), size); + cudaMalloc(reinterpret_cast(&d_in2), size); + in1 = reinterpret_cast(malloc(size)); + in2 = reinterpret_cast(malloc(size)); + out = reinterpret_cast(malloc(size)); + + // right shift 1, mimic the unalignment of address + float16* r_in1 = + reinterpret_cast(reinterpret_cast(in1) + shift_bit); + float16* r_in2 = + reinterpret_cast(reinterpret_cast(in2) + shift_bit); + + std::minstd_rand engine; + std::uniform_real_distribution dist(0.0, 1.0); + for (size_t i = 0; i < num / 2; ++i) { + r_in1[i] = static_cast(dist(engine)); + r_in2[i] = static_cast(dist(engine)); + } + cudaMemcpy(d_in1, r_in1, array_size, cudaMemcpyHostToDevice); + cudaMemcpy(d_in2, r_in2, array_size, cudaMemcpyHostToDevice); + AddKernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num / 2); + cudaDeviceSynchronize(); + cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); + for (size_t i = 0; i < num / 2; ++i) { + // NOTE(dzhwinter): the float16 add has small underflow/overflow + // so we use EXPECT_NEAR to check the result. + EXPECT_NEAR(static_cast(out[i]), + static_cast(AddFunctor()(r_in1[i], r_in2[i])), + 0.001); + } + free(in1); + free(in2); + free(out); + cudaFree(d_in1); + cudaFree(d_in2); +} + +TEST(CudaAtomic, float16Unalign) { + // same with float16 testcase + TestUnalign(static_cast(2), /*shift_bit*/ 2); + TestUnalign(static_cast(1024), /*shift_bit*/ 2); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 2); + + // shift the address. + TestUnalign(static_cast(2), /*shift_bit*/ 1); + TestUnalign(static_cast(1024), /*shift_bit*/ 1); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 1); - Testfloat16Add(static_cast(10)); - Testfloat16Add(static_cast(1024 * 1024)); + TestUnalign(static_cast(2), /*shift_bit*/ 3); + TestUnalign(static_cast(1024), /*shift_bit*/ 3); + TestUnalign(static_cast(1024 * 1024), /*shift_bit*/ 3); } diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index 94ce83975a7f13daa2b6a4d480cb22cc95811b9b..67ea64833d3b844d88a2e5996f860ef165bd8ffd 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -79,41 +79,41 @@ CUDA_ATOMIC_WRAPPER(Add, double) { // convert the value into float and do the add arithmetic. // then store the result into a uint32. -inline __device__ uint32_t add_to_low_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { float16 low_half; // the float16 in lower 16bits - low_half.x = static_cast(val & 0xffffu); + low_half.x = static_cast(val & 0xFFFFu); low_half = static_cast(static_cast(low_half) + x); - return (val & 0xffff0000u) | low_half.x; + return (val & 0xFFFF0000u) | low_half.x; } -inline __device__ uint32_t add_to_high_half(uint32_t val, float x) { +inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { float16 high_half; // the float16 in higher 16bits high_half.x = static_cast(val >> 16); high_half = static_cast(static_cast(high_half) + x); - return (val & 0xffffu) | (static_cast(high_half.x) << 16); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); } CUDA_ATOMIC_WRAPPER(Add, float16) { // concrete packed float16 value may exsits in lower or higher 16bits // of the 32bits address. - uint32_t *address_as_ui = - reinterpret_cast(reinterpret_cast(address) - - (reinterpret_cast(address) & 2)); + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); float val_f = static_cast(val); uint32_t old = *address_as_ui; uint32_t sum; uint32_t newval; uint32_t assumed; - if (((size_t)address & 2) == 0) { + if (((uintptr_t)address & 0x02) == 0) { // the float16 value stay at lower 16 bits of the address. do { assumed = old; old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); } while (old != assumed); float16 ret; - ret.x = old & 0xffffu; + ret.x = old & 0xFFFFu; return ret; } else { // the float16 value stay at higher 16 bits of the address. diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0f334b2892d77b836728cf79898d1832e90e7c00..a8bc16f1b5b9b624e88e355d8ce4741fcec34bc3 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -534,7 +534,7 @@ EOF make -j `nproc` inference_lib_dist cd ${PADDLE_ROOT}/build cp -r fluid_install_dir fluid - tar -cf fluid.tgz fluid + tar -czf fluid.tgz fluid fi } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d1d6dd75ee98411fcd7d444b18f9838064b774b0..956e3c43485b36aaeb2d366d6145edd3d4535122 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -127,6 +127,7 @@ def __bootstrap__(): ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') + read_env_flags.append('listen_and_serv_profile_period') if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index f05ae6d5d1900560e37370121bf64f1fcab14357..3ee1c636ace504e14cf7d6c106df1bc3e864d660 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -21,6 +21,7 @@ from ..layer_helper import LayerHelper, unique_name from ..initializer import force_init_on_cpu from ops import logical_and, logical_not, logical_or import numpy +import warnings __all__ = [ 'While', @@ -280,6 +281,9 @@ class ParallelDo(object): """ def __init__(self, places, use_nccl=False, name=None): + warnings.warn( + "API ParallelDo is deprecated since 0.15.0. Please use ParallelExecutor instead.", + Warning) self.helper = LayerHelper("parallel_do", name=name) self.inputs = [] self.places = places @@ -338,7 +342,7 @@ class ParallelDo(object): return [parent_block.var(name) for name in params] - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program current_block = main_program.current_block() parent_block = self.parent_block() @@ -394,7 +398,7 @@ class BlockGuardWithCompletion(BlockGuard): if exc_type is not None: return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK - self.rnn.complete_op() + self.rnn._complete_op() return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val, exc_tb) @@ -470,7 +474,7 @@ class StaticRNN(object): if shape is None or batch_ref is None: raise ValueError( "if init is None, memory at least need shape and batch_ref") - parent_block = self.parent_block() + parent_block = self._parent_block() var_name = unique_name.generate("@".join( [self.helper.name, "memory_boot"])) boot_var = parent_block.create_var( @@ -527,7 +531,7 @@ class StaticRNN(object): outputs={'Out': tmp_o}, attrs={'dtype': o.dtype}) - out_var = self.parent_block().create_var( + out_var = self._parent_block().create_var( name=tmp_o.name, shape=[self.seq_len] + list(tmp_o.shape), dtype=tmp_o.dtype) @@ -543,7 +547,7 @@ class StaticRNN(object): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var - def parent_block(self): + def _parent_block(self): prog = self.helper.main_program parent_idx = prog.current_block().parent_idx assert parent_idx >= 0 @@ -560,10 +564,10 @@ class StaticRNN(object): else: return self.outputs - def complete_op(self): + def _complete_op(self): main_program = self.helper.main_program rnn_block = main_program.current_block() - parent_block = self.parent_block() + parent_block = self._parent_block() local_inputs = set() @@ -643,7 +647,7 @@ class WhileGuard(BlockGuard): if exc_type is not None: return False self.while_op.status = While.AFTER_WHILE_BLOCK - self.while_op.complete() + self.while_op._complete() return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) @@ -690,7 +694,7 @@ class While(object): def block(self): return WhileGuard(self) - def complete(self): + def _complete(self): main_program = self.helper.main_program while_block = main_program.current_block() parent_block = main_program.block(main_program.current_block() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5d7f1eadd93a82dc2bdb88c5f5c80e437df4e29f..058acd4a50ef54cea724a742d40eaca8f569a21c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4473,15 +4473,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): "except one unknown dimension.") helper = LayerHelper("reshape", **locals()) - reshaped = helper.create_tmp_variable(dtype=x.dtype) + out = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", inputs=inputs, - attrs={"shape": shape, - "inplace": inplace}, - outputs={"Out": reshaped}) + attrs={"shape": shape}, + outputs={"Out": out}) - return helper.append_activation(reshaped) + return helper.append_activation(out) def lod_reset(x, y=None, target_lod=None): diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index cfd6e63e12258a92447e68b4afbc7ead91b68cc1..67733807f8f8582f68dcfa3f361e13a631a29597 100644 --- a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -43,5 +43,29 @@ class TestControlFlowGraph(unittest.TestCase): print(str(result_program)) +class TestMemoryTranspiler2(unittest.TestCase): + def setUp(self): + program = Program() + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + fc = layers.fc(input=x, size=10, act=None) + reshape = layers.reshape(x=fc, shape=[-1, 2, 5]) + fc = layers.reshape(x=reshape, shape=[-1, 5, 2]) + y_predict = layers.fc(input=fc, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(cost) + opt = optimizer.SGD(learning_rate=0.001) + opt.minimize(avg_cost) + self.program = program + + def test_inplace_ops(self): + print("before optimization") + print(str(self.program)) + result_program = memory_optimize(self.program) + print("after optimization") + print(str(result_program)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index f51b5a7e9907294a5b91c920a363830d8b9a7137..2f5558578ac2a002a83c2a7e027ec5a96d8b4414 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -25,7 +25,7 @@ class TestReshapeOp(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): @@ -42,7 +42,7 @@ class TestReshapeOpDimInfer1(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} def test_check_output(self): @@ -60,7 +60,7 @@ class TestReshapeOpDimInfer2(OpTest): self.op_type = "reshape" self.inputs = {"X": np.random.random(ori_shape).astype("float32")} - self.attrs = {"shape": new_shape, "inplace": False} + self.attrs = {"shape": new_shape} self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} def test_check_output(self): diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py index e9f0a06a56b42952800411d548bb3fc1732e031e..ca7861309839d183e18c168403881a0b1b5bf309 100644 --- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -15,6 +15,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core +from paddle.fluid.op import Operator class TestSplitIdsOp(OpTest): @@ -31,5 +33,55 @@ class TestSplitIdsOp(OpTest): self.check_output() +class TestSpliteIds(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + return places + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + def check_with_place(self, place): + scope = core.Scope() + rows = [0, 5, 7, 4, 9] + height = 20 + row_numel = 2 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(rows) + x.set_height(height) + np_array = np.ones((len(rows), row_numel)).astype("float32") + for i in range(len(rows)): + for j in range(row_numel): + np_array[i, j] = rows[i] + j + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + outs_name = ["out%d" % i for i in xrange(3)] + outs = [ + scope.var(var_name).get_selected_rows() for var_name in outs_name + ] + + # expected output selected rows + expected_out_rows = [[0, 9], [7, 4], [5]] + + op = Operator("split_ids", Ids="X", Out=outs_name) + + for _ in range(3): + op.run(scope, place) + + for i in range(len(outs)): + expected_rows = expected_out_rows[i] + self.assertEqual(outs[i].rows(), expected_rows) + for j in range(len(expected_rows)): + row = expected_rows[j] + self.assertAlmostEqual( + float(row), np.array(outs[i].get_tensor())[j, 0]) + self.assertAlmostEqual( + float(row + 1), np.array(outs[i].get_tensor())[j, 1]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 2d9c089c0b7667c875aae05cb4e6040b007f3d55..d4d19799fdb291545117f327d2b9b2c25fbfe5f5 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -495,6 +495,7 @@ class DistributeTranspiler(object): pserver_index = self.pserver_endpoints.index(endpoint) table_opt_block = self._create_table_optimize_block( pserver_index, pserver_program, pre_block_idx, grad_to_block_id) + optimize_blocks.append(table_opt_block) prefetch_var_name_to_block_id = self._create_prefetch_block( pserver_index, pserver_program, table_opt_block) checkpoint_block_id = self._create_checkpoint_save_block( diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index bca0b77ad71a3f65dda15191e5f540bfc2e043d1..0b72ea323b72a1a6cfd0911416c4037243d06ff4 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -13,7 +13,7 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH} ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig -RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz +RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ bash build_scripts/install_nccl2.sh && rm -r build_scripts