diff --git a/AUTHORS.md b/AUTHORS.md index 8c4a113fc276783c945867ceae9612339b7f0bbc..41b7193677a0208ba2fa82b72862292572dcb6ef 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -46,6 +46,7 @@ | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | | typhoonzero | Yi Wu | +| velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | | wangyang59 | Yang Wang | | wangzhen-nlp | Zhen Wang | diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index f8aed5a5e06c5e29dbdfb5db9f2ea0344c7eed6d..6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args): trainer_id, pservers=pserver_endpoints, trainers=trainers, - sync_mode=not args.async_mode, - slice_var_up=not args.no_split_var) + sync_mode=not args.async_mode) if training_role == "PSERVER": pserver_program = t.get_pserver_program(current_endpoint) pserver_startup_program = t.get_startup_program(current_endpoint, diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 82437a84248fece843c3659c9422d9b579b5066f..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -50,7 +50,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 - PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch + PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h # NOTE(yuyang18): # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake index eafb11b6f21e226fc68556a78d675dea94080140..07bab994d354df834d0667c69f307b2d7684fb22 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -263,7 +263,7 @@ function(cc_test TARGET_NAME) COMMAND ${TARGET_NAME} ${cc_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() @@ -328,7 +328,7 @@ function(nv_test TARGET_NAME) add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index e2c58cd56055455e7fedc598ca8f56183d4b51dc..aeb081e76e5bc5b9d3d81ce625195c800174ab6c 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -148,18 +148,11 @@ if (WITH_ANAKIN AND WITH_GPU) list(APPEND inference_deps anakin_inference_lib) endif() -copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared - SRCS ${src_dir}/${module}/paddle_inference_api.h - ${src_dir}/${module}/demo_ci - ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api* - DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference -) -list(APPEND inference_deps inference_api_lib) - set(module "inference") copy(inference_lib DEPS ${inference_deps} SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* - DSTS ${dst_dir}/${module} ${dst_dir}/${module} + ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci + DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) set(module "platform") diff --git a/doc/fluid/design/ir/draft.md b/doc/fluid/design/ir/draft.md index a33b5a9c9312c93247a1e1f3431061a5aad6c884..c29337cba1fe859e4968cb800e4e7d9ff6a54d31 100644 --- a/doc/fluid/design/ir/draft.md +++ b/doc/fluid/design/ir/draft.md @@ -64,6 +64,41 @@ can also contain other things that describe some properties of the `Graph` or `Graph` nodes. `Attribute` can be passed across `Pass`. However, it should be used with care. +```cpp +class Graph { + public: + explicit Graph(const ProgramDesc &program); + + bool Has(const std::string &attr_name) const; + + template <typename AttrType> + AttrType &Get(const std::string &attr_name) const; + + template <typename AttrType> + void Set(const std::string &attr_name, AttrType *attr); + const std::unordered_set<ir::Node *> &Nodes() const; + + // Create a normal variable with non-null VarDesc. + ir::Node *CreateVarNode(VarDesc *var_desc); + + // Create a normal runnable operator with OpDesc. + ir::Node *CreateOpNode(OpDesc *op_desc); + + // Create a control dependency var that connects 2 operations. The + // var doesn't hold any data. Other than that, it's no different from + // other var, considering dependency analysis. + ir::Node *CreateControlDepVar(); + + // A more free-style way of creating a graph node. Mostly used for tests + // or to "copy" from another node. Avoid using it if possible.
+ ir::Node *CreateEmptyNode(const std::string &name, ir::Node::Type type); + + // Clear all node information of the graph and return the ownership of the + // nodes. + std::vector<std::unique_ptr<ir::Node>> ReleaseNodes(); +}; +``` + #### Pass `Pass` represents a transformation of `Graph`. Its input @@ -71,6 +106,54 @@ is a `Graph` and its output is also a `Graph`. For example, a `Pass` can simply print out the `Graph`. A `Pass` can also fuse some `Graph`'s `Node`s. +```cpp +class Pass { + public: + + std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const { + // Some correctness check. + auto new_graph = ApplyImpl(std::move(graph)); + // Some correctness check. + return new_graph; + } + + // Get a reference to the attribute previously set. + template <typename AttrType> + AttrType &Get(const std::string &attr_name) const; + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template <typename AttrType> + void Set(const std::string &attr_name, AttrType *attr); + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. + template <typename AttrType> + void SetNotOwned(const std::string &attr_name, AttrType *attr); + + protected: + virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const = 0; +}; + +// In my_pass.cc +class MyPass : public Pass { + protected: + std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const override { + // do something. + return graph; + } +}; +REGISTER_PASS(my_pass, MyPass) +.RequirePassAttr("places") +.RequireGraphAttr("dep_vars"); + + +// To use the pass. +auto my_pass = ir::PassRegistry::Instance().Get("my_pass"); +graph = my_pass->Apply(std::move(graph)); +// Note: to force link my_pass.cc, in the code: +USE_PASS(my_pass); +``` + #### Optimize `Optimize` contains a series of `Pass` with defined order. @@ -86,4 +169,17 @@ maintaining the original modeling logic. * Graph is transformed from raw model logic to a form that is efficient to execute. -Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +``` +// Program->ProgramToGraph->Graph->Pass1->Graph->Pass2->Graph->Pass3->Graph->Executor +std::unique_ptr<Graph> graph(new Graph(program)); +graph = PassRegistry::Instance().Get("op_fuse_pass")->Apply(std::move(graph)); +// For more complex Passes, the optimize process can provide Pass attributes.
+auto mem_opt_pass = PassRegistry::Instance().Get("memory_optimization_pass"); +int optimize_level = 1; +mem_opt_pass->SetNotOwned("optimize_level", &optimize_level); +graph = mem_opt_pass->Apply(std::move(graph)); +graph = PassRegistry::Instance().Get("multi_device_pass")->Apply(std::move(graph)); +graph = PassRegistry::Instance().Get("multi_device_check_pass")->Apply(std::move(graph)); +Executor exe; +exe.Run(graph); + +``` diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 93ec047c8012e41cc9dfb651e8de2b4749f93299..139411f3e0d945f9265d19a28487c05d06722d69 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -8,9 +8,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context) if(WITH_GPU) - nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type) + nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context) else() - cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type) + cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context) endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) @@ -99,7 +99,7 @@ else() endif() -cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) @@ -110,7 +110,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) - + # cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9df7df1f42886d40210b16aa2ae5823e3310bfe7..5d652d37307d0a55ffee14930ae180dcd3e27841 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -31,9 +31,6 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle) - -cc_library(ssa_graph_builder_factory SRCS ssa_graph_builder_factory.cc DEPS multi_devices_graph_builder ssa_graph_printer ssa_graph_checker) - cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 2f2869b1634256c3745e733bb1b99bfe4ddf8924..5ca2ed8f96244a11925dfa6af8e48458cf334ecd 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -34,30
+34,22 @@ namespace paddle { namespace framework { namespace details { +static const char kLossVarName[] = "loss_var_name"; +static const char kPlaces[] = "places"; +static const char kParams[] = "params"; +static const char kLocalScopes[] = "local_scopes"; +static const char kStrategy[] = "strategy"; + +void MultiDevSSAGraphBuilder::Init() const { + loss_var_name_ = Get(kLossVarName); + places_ = Get>(kPlaces); + local_scopes_ = Get>(kLocalScopes); + strategy_ = Get(kStrategy); #ifdef PADDLE_WITH_CUDA -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs), - strategy_(strategy) { -#else -MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( - const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, const BuildStrategy &strategy) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - strategy_(strategy) { + nccl_ctxs_ = &Get("nccl_ctxs"); #endif - for (auto &p : params) { + + for (auto &p : Get>(kParams)) { grad_names_.insert(GradVarName(p)); } balance_vars_.resize(places_.size(), 0); @@ -72,7 +64,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t place_id) const { auto p = places_[place_id]; - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -239,8 +231,9 @@ std::vector SortOpsAndDelayOptimizeOp(const ir::Graph &graph) { return sorted_ret; } -std::unique_ptr MultiDevSSAGraphBuilder::Apply( +std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( std::unique_ptr graph) const { + Init(); // Give the topology sort order and rebuild the graph structure. std::vector sorted_ops = SortOpsAndDelayOptimizeOp(*graph); auto nodes = graph->ReleaseNodes(); @@ -254,9 +247,10 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( std::unordered_set og_has_been_broadcast; // We cannot invoke resize. It is a bug of GCC 4.8 - result.Set("vars", new GraphVars(places_.size())); - result.Set("dep_vars", new GraphDepVars); - result.Set("ops", new GraphOps); + result.Set(kGraphVars, new GraphVars(places_.size())); + result.Set(kGraphDepVars, new GraphDepVars); + result.Set(kGraphOps, new GraphOps); + result.Set(kShardedVarDevice, new ShardedVarDevice); // find send/recv vars so that we can place the distributed training // related op in the place 0 @@ -289,11 +283,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( // the block. is_forwarding = false; } else { - int op_dev_id = GetOpDeviceID(node); + int op_dev_id = GetOpDeviceID(result, node); if (op_dev_id != -1) { // This op only runs on one specific device. 
CreateComputationalOp(&result, node, op_dev_id); for (ir::Node *n : node->outputs) { - var_name_on_devices_.emplace(n->Name(), op_dev_id); + graph->Get(kShardedVarDevice) + .emplace(n->Name(), op_dev_id); } } else { // This op runs on all devices, and its output may have parameter's @@ -330,7 +325,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Apply( case BuildStrategy::ReduceStrategy::kReduce: cur_device_id = GetAppropriateDeviceID({g_name}); CreateReduceOp(&result, g_name, cur_device_id); - var_name_on_devices_.emplace(g_name, cur_device_id); + graph->Get(kShardedVarDevice) + .emplace(g_name, cur_device_id); bcast_var_name_set[cur_device_id].emplace(p_name); break; case BuildStrategy::ReduceStrategy::kAllReduce: @@ -416,16 +412,16 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_); #endif - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); auto *in = - result->Get("vars").at(src_dev_id).at(p_name).back().get(); + result->Get(kGraphVars).at(src_dev_id).at(p_name).back().get(); op_handle->AddInput(in); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars").at(i).at(p_name); + auto &vars = result->Get(kGraphVars).at(i).at(p_name); auto *out_var = new VarHandle( result->CreateEmptyNode(p_name, ir::Node::Type::kVariable), vars.size(), i, p_name, p); @@ -437,7 +433,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const { - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id])); CreateOpHandleIOs(result, node, dev_id); @@ -446,20 +442,20 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new AllReduceOpHandle( + result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); @@ -475,20 +471,20 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new DataBalanceOpHandle( + result->Get(kGraphOps).emplace_back(new 
DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); for (const std::string &d_name : datas) { - auto &vars = result->Get("vars")[i][d_name]; + auto &vars = result->Get(kGraphVars)[i][d_name]; PADDLE_ENFORCE(!vars.empty()); op_handle->AddInput(vars.back().get()); auto var = new VarHandle( @@ -512,7 +508,8 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce( return is_pg_once; } -int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { +int MultiDevSSAGraphBuilder::GetOpDeviceID(const ir::Graph &graph, + ir::Node *node) const { if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { return -1; } @@ -525,15 +522,17 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1]); + int dev_id = GetVarDeviceID(graph, param_grad[1]); PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", node->Op()->Type(), param_grad[0], param_grad[1]); return dev_id; } -int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const { - auto got = var_name_on_devices_.find(varname); - return got == var_name_on_devices_.end() ? -1 : got->second; +int MultiDevSSAGraphBuilder::GetVarDeviceID(const ir::Graph &graph, + const std::string &varname) const { + auto &sharded_var_device = graph.Get(kShardedVarDevice); + auto got = sharded_var_device.find(varname); + return got == sharded_var_device.end() ? -1 : got->second; } void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { @@ -551,7 +550,7 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp(ir::Graph *result) const { result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), local_scopes_.size(), local_scopes_[i], places_[i], communication_dev_ctx); - result->Get("ops").emplace_back(op_handle); + result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. 
@@ -572,7 +571,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get("ops").emplace_back( + result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); CreateOpHandleIOs(result, node, scope_idx); } @@ -582,25 +581,25 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { #ifdef PADDLE_WITH_CUDA - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); #else - result->Get("ops").emplace_back(new ReduceOpHandle( + result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_)); #endif - auto *op_handle = result->Get("ops").back().get(); + auto *op_handle = result->Get(kGraphOps).back().get(); for (size_t i = 0; i < places_.size(); ++i) { auto &p = places_[i]; SetCommunicationContext(op_handle, p); - auto &vars = result->Get("vars")[i][og]; + auto &vars = result->Get(kGraphVars)[i][og]; PADDLE_ENFORCE(!vars.empty()); auto &prev_grad = vars.back(); op_handle->AddInput(prev_grad.get()); } - auto &vars = result->Get("vars")[dst_dev_id][og]; + auto &vars = result->Get(kGraphVars)[dst_dev_id][og]; auto var = new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), vars.size(), dst_dev_id, og, places_[dst_dev_id]); @@ -613,11 +612,11 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, // on it. void MultiDevSSAGraphBuilder::ConnectOp(ir::Graph *result, OpHandleBase *op, const std::string &prev_op_name) const { - for (auto &prev_op : result->Get("ops")) { + for (auto &prev_op : result->Get(kGraphOps)) { if (prev_op->Name() == prev_op_name) { auto *dep_var = new DummyVarHandle(result->CreateControlDepVar()); prev_op->AddOutput(dep_var); - result->Get("dep_vars").emplace(dep_var); + result->Get(kGraphDepVars).emplace(dep_var); op->AddInput(dep_var); } } @@ -638,20 +637,23 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, if (node->Op()->Type() == "split_byref" || node->Op()->Type() == "split_selected_rows") { // TODO(paddle-dev): getting the first var is not safe. 
- op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0]); + op_dev_id = GetVarDeviceID(*result, input_var_names[0]); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { PADDLE_ENFORCE( @@ -665,7 +667,7 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, CreateComputationalOp(result, node, op_dev_id); if (node->Op()->Type() == "concat") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "fetch_barrier"); } } @@ -676,7 +678,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); + op_dev_id = GetVarDeviceID(*result, node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -691,7 +693,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(input_var_names); for (auto &varname : input_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } } else if (node->Op()->Type() == "recv") { @@ -701,7 +704,8 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, } op_dev_id = GetAppropriateDeviceID(output_var_names); for (auto &varname : output_var_names) { - var_name_on_devices_.emplace(varname, op_dev_id); + result->Get(kShardedVarDevice) + .emplace(varname, op_dev_id); } } else { // send_barrier and fetch_barrier op can be scheduled on device 0 @@ -711,17 +715,18 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result, PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s", node->Op()->Type()); - result->Get("ops").emplace_back(new RPCOpHandle( + result->Get(kGraphOps).emplace_back(new RPCOpHandle( result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id], node->Op()->Type(), places_[op_dev_id])); + // TODO(panyx0718): This might not be needed anymore. 
if (node->Op()->Type() == "send_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "send"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "send"); } else if (node->Op()->Type() == "recv") { - ConnectOp(result, result->Get("ops").back().get(), + ConnectOp(result, result->Get(kGraphOps).back().get(), "send_barrier"); } else if (node->Op()->Type() == "fetch_barrier") { - ConnectOp(result, result->Get("ops").back().get(), "recv"); + ConnectOp(result, result->Get(kGraphOps).back().get(), "recv"); } else if (node->Op()->Type() == "send") { // do nothing } else { @@ -743,3 +748,11 @@ bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_pass, + paddle::framework::details::MultiDevSSAGraphBuilder) + .RequirePassAttr(paddle::framework::details::kLossVarName) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kParams) + .RequirePassAttr(paddle::framework::details::kLocalScopes) + .RequirePassAttr(paddle::framework::details::kStrategy); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 55076f227b5ab56d66b5053173c9e915da23b15f..099dbe5abef6458c4613c9f680440734f59cb6e2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -31,39 +31,27 @@ class Scope; namespace details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: -#ifdef PADDLE_WITH_CUDA - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs, - const BuildStrategy &strategy); -#else - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - const BuildStrategy &strategy); -#endif - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - int GetVarDeviceID(const std::string &varname) const override; private: void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, size_t device_id) const; + void Init() const; private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - std::unordered_set grad_names_; + mutable std::string loss_var_name_; + mutable std::vector places_; + mutable std::vector local_scopes_; + mutable std::unordered_set grad_names_; #ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap *nccl_ctxs_; + mutable platform::NCCLContextMap *nccl_ctxs_; #endif + int GetVarDeviceID(const ir::Graph &graph, const std::string &varname) const; + bool IsScaleLossOp(ir::Node *node) const; void CreateRPCOp(ir::Graph *result, ir::Node *node) const; @@ -97,7 +85,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::string &og, std::unordered_set *og_has_been_broadcast) const; - int GetOpDeviceID(ir::Node *node) const; + int GetOpDeviceID(const ir::Graph &graph, ir::Node *node) const; void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; @@ -113,9 +101,8 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &var_names) const; private: - BuildStrategy strategy_; + mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; - mutable std::unordered_map var_name_on_devices_; 
mutable std::vector balance_vars_; void SetCommunicationContext(OpHandleBase *op_handle, diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index cbfbcb1c0cd24f16773f9633310166371600790c..1b188aec5995edb73835bcf5b851952db0f95f48 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -40,6 +40,9 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { ExecutionStrategy strategy, std::vector local_scopes, std::vector var_infos, std::vector places, std::unique_ptr&& underlying_executor); + + const ir::Graph& Graph() const { return underlying_executor_->Graph(); } + FeedFetchList Run(const std::vector& fetch_tensors) override; private: diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 506e7eb35cd977869424223cb863dd64dbaa9d30..575532540a624afde5f6dab25b11e9eac93c6448 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -18,7 +18,7 @@ namespace paddle { namespace framework { namespace details { void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { continue; @@ -50,7 +50,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); read_op->AddOutput(dep_var); write_op->AddInput(dep_var); - graph->Get("dep_vars").emplace(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); } } } @@ -60,7 +60,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(ir::Graph *graph) { VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( ir::Graph *graph, ir::Node *node, const platform::Place &place, size_t place_offset) { - auto &var_holders = graph->Get("vars")[place_offset]; + auto &var_holders = graph->Get(kGraphVars)[place_offset]; auto &var_holder = var_holders[node->Name()]; VarHandle *var = nullptr; if (var_holder.empty()) { @@ -83,7 +83,8 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, ir::Node *new_node, const platform::Place &place, size_t place_offset) { - auto &vars = graph->Get("vars")[place_offset][new_node->Name()]; + auto &vars = + graph->Get(kGraphVars)[place_offset][new_node->Name()]; size_t version = vars.size(); auto var = new VarHandle(new_node, version, place_offset, new_node->Name(), place); @@ -92,12 +93,12 @@ void SSAGraphBuilder::CreateOpOutput(ir::Graph *graph, OpHandleBase *op_handle, } void SSAGraphBuilder::AddOutputToLeafOps(ir::Graph *graph) { - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (!op->Outputs().empty()) { continue; } auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get("dep_vars").emplace(dummy_leaf); + graph->Get(kGraphDepVars).emplace(dummy_leaf); op->AddOutput(dummy_leaf); } } diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 2b4f31f2ff3444f909e3be5eb810ae6737e237b2..53a4ad003d51a27a044d7a142434545eca0d5965 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -39,21 +39,25 @@ namespace details { typedef 
std::vector< std::unordered_map>>> GraphVars; +const char kGraphVars[] = "vars"; // aux variables to represent dependency. Useful to resolve data hazard. typedef std::unordered_set> GraphDepVars; +const char kGraphDepVars[] = "dep_vars"; // all operators. NOTE that even we use a vector here, the operators is // unordered. typedef std::vector> GraphOps; +const char kGraphOps[] = "ops"; + +typedef std::unordered_map ShardedVarDevice; +const char kShardedVarDevice[] = "sharded_var_device"; class SSAGraphBuilder : public ir::Pass { public: SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} - virtual int GetVarDeviceID(const std::string &var_name) const = 0; - DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); protected: diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc b/paddle/fluid/framework/details/ssa_graph_builder_factory.cc deleted file mode 100644 index b4b49d3de6da2e5fd7836668619e42d10bb6b35a..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.cc +++ /dev/null @@ -1,50 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h" -#include -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph_checker.h" -#include "paddle/fluid/framework/details/ssa_graph_printer.h" - -namespace paddle { -namespace framework { -namespace details { -std::unique_ptr SSAGraphBuilderFactory::Create() { - std::unique_ptr res( -#ifdef PADDLE_WITH_CUDA - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, nccl_ctxs_, strategy_) -#else - new MultiDevSSAGraphBuilder(places_, loss_var_name_, param_names_, - local_scopes_, strategy_) -#endif - ); // NOLINT - - if (!strategy_.debug_graphviz_path_.empty()) { - std::unique_ptr fout( - new std::ofstream(strategy_.debug_graphviz_path_)); - PADDLE_ENFORCE(fout->good()); - std::unique_ptr graphviz_printer( - new GraphvizSSAGraphPrinter()); - res.reset(new SSAGraghBuilderWithPrinter( - std::move(fout), std::move(graphviz_printer), std::move(res))); - } - res.reset(new SSAGraghBuilderWithChecker(std::move(res))); - - return res; -} -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder_factory.h b/paddle/fluid/framework/details/ssa_graph_builder_factory.h deleted file mode 100644 index 91a119de83ed3d1573803e48faf86c874eed98d6..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/ssa_graph_builder_factory.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/details/build_strategy.h" -#include "paddle/fluid/framework/details/ssa_graph_builder.h" -#include "paddle/fluid/platform/place.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/nccl_helper.h" -#endif - -namespace paddle { -namespace framework { -class Scope; -namespace details { - -class SSAGraphBuilderFactory { - public: - SSAGraphBuilderFactory(const std::vector& places, - const std::string& loss_var_name, - const std::unordered_set& param_names, - const std::vector& local_scopes, - const BuildStrategy& strategy) - : places_(places), - loss_var_name_(loss_var_name), - param_names_(param_names), - local_scopes_(local_scopes), - strategy_(strategy) { -#ifdef PADDLE_WITH_CUDA - nccl_ctxs_ = nullptr; -#endif - } - -#ifdef PADDLE_WITH_CUDA - void SetNCCLContextMap(platform::NCCLContextMap* nccl_ctxs) { - nccl_ctxs_ = nccl_ctxs; - } -#endif - - std::unique_ptr Create(); - - private: - std::vector places_; - std::string loss_var_name_; - std::unordered_set param_names_; - std::vector local_scopes_; - BuildStrategy strategy_; - -#ifdef PADDLE_WITH_CUDA - platform::NCCLContextMap* nccl_ctxs_; -#endif -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_checker.cc b/paddle/fluid/framework/details/ssa_graph_checker.cc index 0438b096109a287366610d06ef2bd14c765a8f43..b9e1cda1f24810009bc74a7abdf0156f723a1755 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.cc +++ b/paddle/fluid/framework/details/ssa_graph_checker.cc @@ -33,7 +33,7 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } }; - for (auto &var_map : graph->Get("vars")) { + for (auto &var_map : graph->Get(kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { insert_pending_var(version_pair.get()); @@ -41,11 +41,11 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } } - for (auto &var : graph->Get("dep_vars")) { + for (auto &var : graph->Get(kGraphDepVars)) { insert_pending_var(var.get()); } - for (auto &op : graph->Get("ops")) { + for (auto &op : graph->Get(kGraphOps)) { if (op->Inputs().empty()) { ready_ops.insert(op.get()); } else { @@ -85,3 +85,10 @@ bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_check_pass, + paddle::framework::details::SSAGraghBuilderWithChecker) + .RequireGraphAttr(paddle::framework::details::kGraphVars) + .RequireGraphAttr(paddle::framework::details::kGraphDepVars) + .RequireGraphAttr(paddle::framework::details::kGraphOps) + .RequireGraphAttr(paddle::framework::details::kShardedVarDevice); diff --git a/paddle/fluid/framework/details/ssa_graph_checker.h b/paddle/fluid/framework/details/ssa_graph_checker.h index 51ce6e5ecad755613551aa6525b5cfbe4a8933ef..0e861ecb236361992d9883e3bd0e679f7563b539 100644 --- a/paddle/fluid/framework/details/ssa_graph_checker.h +++ 
b/paddle/fluid/framework/details/ssa_graph_checker.h @@ -23,26 +23,14 @@ namespace framework { namespace details { class SSAGraghBuilderWithChecker : public SSAGraphBuilder { - public: - explicit SSAGraghBuilderWithChecker( - std::unique_ptr&& builder) - : builder_(std::move(builder)) {} - - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - PADDLE_ENFORCE(IsValidGraph(new_graph.get())); - return new_graph; - } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; } bool IsValidGraph(const ir::Graph* graph) const; - - private: - std::unique_ptr builder_; }; } // namespace details diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h index 8815ec89b23bc874471eefde5fa855cd2a4bde1f..96fffb7d9430cd00b3823ada9fbe9a65a6bd718c 100644 --- a/paddle/fluid/framework/details/ssa_graph_executor.h +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -32,7 +32,9 @@ class SSAGraphExecutor { virtual ~SSAGraphExecutor(); - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + virtual const ir::Graph& Graph() const = 0; + + virtual FeedFetchList Run(const std::vector& fetch_tensors) = 0; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/ssa_graph_printer.cc b/paddle/fluid/framework/details/ssa_graph_printer.cc index 20aab1464400aa9bb1bd6af11c06269c688a8308..ec3f31ab8d135efd2c77018e90cec46b25ca5e66 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.cc +++ b/paddle/fluid/framework/details/ssa_graph_printer.cc @@ -22,7 +22,7 @@ namespace details { template static inline void IterAllVar(const ir::Graph &graph, Callback callback) { - for (auto &each : graph.Get("vars")) { + for (auto &each : graph.Get(kGraphVars)) { for (auto &pair1 : each) { for (auto &pair2 : pair1.second) { callback(*pair2); @@ -30,7 +30,7 @@ static inline void IterAllVar(const ir::Graph &graph, Callback callback) { } } - for (auto &var : graph.Get("dep_vars")) { + for (auto &var : graph.Get(kGraphDepVars)) { callback(*var); } } @@ -61,7 +61,7 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, }); size_t op_id = 0; - for (auto &op : graph.Get("ops")) { + for (auto &op : graph.Get(kGraphOps)) { std::string op_name = "op_" + std::to_string(op_id++); sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" << std::endl; @@ -81,3 +81,6 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace details } // namespace framework } // namespace paddle + +REGISTER_PASS(multi_device_print_pass, + paddle::framework::details::SSAGraghBuilderWithPrinter); diff --git a/paddle/fluid/framework/details/ssa_graph_printer.h b/paddle/fluid/framework/details/ssa_graph_printer.h index a77c1bad3f15bca9064ded860696eb68b033b090..5eafd1805c3102dbd3cdfa68ee1495631c182b51 100644 --- a/paddle/fluid/framework/details/ssa_graph_printer.h +++ b/paddle/fluid/framework/details/ssa_graph_printer.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/details/ssa_graph_builder.h" @@ -34,38 +36,15 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { }; class SSAGraghBuilderWithPrinter : public SSAGraphBuilder { - public: - SSAGraghBuilderWithPrinter(std::ostream& sout, - std::unique_ptr&& printer, - std::unique_ptr&& 
builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ref_(sout) {} - - SSAGraghBuilderWithPrinter(std::unique_ptr&& sout, - std::unique_ptr&& printer, - std::unique_ptr&& builder) - : printer_(std::move(printer)), - builder_(std::move(builder)), - stream_ptr_(std::move(sout)), - stream_ref_(*stream_ptr_) {} - - std::unique_ptr Apply( + protected: + std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { - auto new_graph = builder_->Apply(std::move(graph)); - printer_->Print(*new_graph, stream_ref_); - return new_graph; + std::unique_ptr fout( + new std::ofstream(Get("debug_graphviz_path"))); + PADDLE_ENFORCE(fout->good()); + Get("graph_printer").Print(*graph, *fout); + return graph; } - - int GetVarDeviceID(const std::string& var_name) const override { - return builder_->GetVarDeviceID(var_name); - } - - private: - std::unique_ptr printer_; - std::unique_ptr builder_; - std::unique_ptr stream_ptr_; - std::ostream& stream_ref_; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c19f74476f9a1498a7d61f5faf204e9966aea155..eec405073377b2782d7636c08e6eb3a7bd41202d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -45,18 +45,18 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( std::unordered_set delayed_ops; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(&pending_vars, &ready_vars, version_pair.get()); } } } - for (auto &var : graph_->Get("dep_vars")) { + for (auto &var : graph_->Get(details::kGraphDepVars)) { InsertPendingVar(&pending_vars, &ready_vars, var.get()); } - for (auto &op : graph_->Get("ops")) { + for (auto &op : graph_->Get(details::kGraphOps)) { if (op->Inputs().empty()) { // Special case, Op has no input. 
ready_ops.insert(op.get()); } else { @@ -162,7 +162,7 @@ void ThreadedSSAGraphExecutor::InsertFetchOps( std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->Get("vars")) { + for (auto &var_map : graph_->Get(details::kGraphVars)) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get()); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 3d67daa45e20fdea52689684397ad01f2f4cd783..82d6b5272aba161bb19067ebef054bc4bbb8701c 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -42,6 +42,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { const std::vector &places, std::unique_ptr &&graph); + const ir::Graph &Graph() const { return *graph_; } // Run a SSAGraph by a thread pool // Use topological sort algorithm FeedFetchList Run(const std::vector &fetch_tensors) override; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6447452ae58344273fe569c91168c7c95a901c8d..bf7d76a8a6e173e648cea5aaba9b7202d787173b 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -1,6 +1,9 @@ cc_library(node SRCS node.cc DEPS proto_desc) cc_library(graph SRCS graph.cc DEPS node) cc_library(graph_helper SRCS graph_helper.cc DEPS graph) -cc_library(pass SRCS pass.cc DEPS graph node) -cc_test(graph_test SRCS graph_test.cc DEPS graph op_registry) -cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph_helper op_registry) +cc_library(pass SRCS pass.cc DEPS graph node graph_helper) +cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper) + +cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) +cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) +cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 740acfafb7594d8d9f3ca5439323ce76c5ed271a..f870fb2b9cf805aba84d6f4573b0574ff361e71c 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -24,6 +24,68 @@ namespace paddle { namespace framework { namespace ir { +std::vector FindDistTrainSendVars( + const std::vector &nodes) { + std::vector send_vars; + // since parameters are all in block 0, + // it's enough to only scan send ops in block 0 + for (auto &node : nodes) { + auto op_vars = node->Op()->InputArgumentNames(); + send_vars.reserve(send_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end()); + } + return send_vars; +} + +std::vector FindDistTrainRecvVars( + const std::vector &nodes) { + std::vector recv_vars; + for (auto &node : nodes) { + auto op_vars = node->Op()->OutputArgumentNames(); + recv_vars.reserve(recv_vars.size() + + std::distance(op_vars.begin(), op_vars.end())); + recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end()); + } + return recv_vars; +} + +bool IsDistTrainOp(ir::Node *node, const std::vector &send_vars, + const std::vector &recv_vars) { + if (send_vars.size() == 0 || recv_vars.size() == 0) { + return false; + } + + /** + * Check any of opvars contains `.block` and in sendvars + */ + auto checker = [](const std::vector 
&opvars, + const std::vector &rpc_vars) -> bool { + for (auto &var : opvars) { + // a variable name with the suffix `.block` means it's a splited + // variable by (DistributeTranspiler) + // [python/paddle/fluid/transpiler/distribute_transpiler.py] + if (var.find(".block") != std::string::npos && + std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) { + return true; + } + } + return false; + }; + + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); + } + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + return checker(output_var_names, send_vars) || + checker(input_var_names, recv_vars); +} + Graph::Graph(const ProgramDesc &program) : program_(program) { VLOG(3) << "block in program:" << program_.Size(); std::unordered_map all_vars; @@ -61,6 +123,64 @@ Graph::Graph(const ProgramDesc &program) : program_(program) { var->inputs.push_back(node); } } + + std::vector send_ops; + ir::Node *send_bar = nullptr; + std::vector recv_ops; + ir::Node *fetch_bar = nullptr; + for (ir::Node *node : Nodes()) { + if (node->Name() == "send") { + send_ops.push_back(node); + } else if (node->Name() == "send_barrier") { + PADDLE_ENFORCE(!send_bar, "only has one send barrier"); + send_bar = node; + } else if (node->Name() == "recv") { + recv_ops.push_back(node); + } else if (node->Name() == "fetch_barrier") { + PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier"); + fetch_bar = node; + } + } + if (send_bar) { + for (ir::Node *send : send_ops) { + ir::Node *dep_var = CreateControlDepVar(); + send->outputs.push_back(dep_var); + dep_var->inputs.push_back(send); + send_bar->inputs.push_back(dep_var); + dep_var->outputs.push_back(send_bar); + } + for (ir::Node *recv : recv_ops) { + ir::Node *dep_var = CreateControlDepVar(); + recv->inputs.push_back(dep_var); + dep_var->outputs.push_back(recv); + send_bar->outputs.push_back(dep_var); + dep_var->inputs.push_back(send_bar); + } + } + if (fetch_bar) { + for (ir::Node *recv : recv_ops) { + ir::Node *dep_var = CreateControlDepVar(); + recv->outputs.push_back(dep_var); + dep_var->inputs.push_back(recv); + fetch_bar->inputs.push_back(dep_var); + dep_var->outputs.push_back(fetch_bar); + } + } + + std::vector send_vars = FindDistTrainSendVars(send_ops); + std::vector recv_vars = FindDistTrainRecvVars(recv_ops); + for (ir::Node *node : Nodes()) { + if (IsDistTrainOp(node, send_vars, recv_vars)) { + if (fetch_bar && node->Name() == "concat") { + ir::Node *dep_var = CreateControlDepVar(); + fetch_bar->outputs.push_back(dep_var); + dep_var->inputs.push_back(fetch_bar); + node->inputs.push_back(dep_var); + dep_var->outputs.push_back(node); + } + } + } + /** * We only handle write after read(WAR), since it should not have a write * after write in program. 
If there are write after write operators, we need diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 4f59ec82a7d1217621c95d9a4a433a9af43e95da..c9d55fbf525a1a476ac469e8e57462169a7db2da 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -40,14 +40,21 @@ class Graph { attr_dels_.clear(); } + bool Has(const std::string &attr_name) const { + return attrs_.find(attr_name) != attrs_.end(); + } + template AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", + attr_name); return *boost::any_cast(attrs_.at(attr_name)); } template void Set(const std::string &attr_name, AttrType *attr) { - PADDLE_ENFORCE(attrs_.count(attr_name) == 0); + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the graph", + attr_name); attrs_[attr_name] = attr; attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..8cb812d1388bf74d173a4dc7a99561e730f8e95a --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { +static const char kGraphVizPath[] = "graph_viz_path"; + +std::unique_ptr GraphVizPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string graph_viz_path = Get(kGraphVizPath); + std::unique_ptr fout(new std::ofstream(graph_viz_path)); + PADDLE_ENFORCE(fout->good()); + std::ostream& sout = *fout; + + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kVariable) continue; + size_t cur_var_id = var_id++; + vars[n] = cur_var_id; + + sout << "var_" << cur_var_id << " [label=\"" << n->Name() << "\"]" + << std::endl; + } + + size_t op_id = 0; + for (const ir::Node* n : graph->Nodes()) { + if (n->NodeType() != ir::Node::Type::kOperation) continue; + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << n->Name() << "\", shape=rect]" + << std::endl; + for (auto in : n->inputs) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : n->outputs) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..1fd8c8a26e9581ccf605d4271a49ec2e90d8b997 --- /dev/null +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class GraphVizPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index c05d7d0bb54c8ba5938e08f7e8dace8f607d7b89..d7158eba62686be57499df697466797e4034ea8f 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -13,7 +13,34 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { +namespace ir { +std::unique_ptr Pass::Apply(std::unique_ptr graph) const { + PADDLE_ENFORCE(!applied_, "Pass can only Apply() once."); + PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); + for (const std::string& attr : required_pass_attrs_) { + PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), + "Required pass atrribute %s not set.", attr); + } + for (const std::string& attr : required_graph_attrs_) { + PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", + attr); + } + auto applied_graph = ApplyImpl(std::move(graph)); + // TODO(panyx0718): Add more verifications. + PADDLE_ENFORCE(!HasCircle(*applied_graph), + "Illegal Pass. Generated graph shouldn't has cycle."); + applied_ = true; + return applied_graph; +} + +PassRegistry& PassRegistry::Instance() { + static PassRegistry g_pass_info_map; + return g_pass_info_map; +} +} // namespace ir +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index f52ba788d55ddb9ed27baa3f6ff0a97e52370fe0..0f14083d259172f5b5f1ed80c7d38312d711beb5 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -14,21 +14,187 @@ limitations under the License. */ #pragma once +#include +#include +#include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { namespace ir { +template +struct PassRegistrar; class Pass { public: Pass() = default; - virtual ~Pass() {} + virtual ~Pass() { + for (auto &attr : attrs_) { + if (attr_dels_.find(attr.first) != attr_dels_.end()) { + attr_dels_[attr.first](); + } + } + attrs_.clear(); + attr_dels_.clear(); + } + + std::unique_ptr Apply(std::unique_ptr graph) const; + + // Get a reference to the attributed previously set. + template + AttrType &Get(const std::string &attr_name) const { + PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), + "%s attr not registered for pass.", attr_name); + return *boost::any_cast(attrs_.at(attr_name)); + } + + // Set a pointer to the attribute. Pass takes ownership of the attribute. + template + void Set(const std::string &attr_name, AttrType *attr) { + PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass", + attr_name); + attrs_[attr_name] = attr; + attr_dels_[attr_name] = [attr, attr_name]() { + VLOG(3) << "deleting " << attr_name; + delete attr; + }; + } + + // Set a pointer to the attribute. Pass doesn't take ownership. Caller + // should delete the attribute. 
+
+using PassCreator = std::function<std::unique_ptr<Pass>()>;
+
+class Registrar {
+ public:
+  // In our design, various kinds of passes have their corresponding registry
+  // and registrar. The action of registration is in the constructor of a
+  // global registrar variable, which is not used by the code that calls the
+  // framework, and so would be removed from the generated binary file by the
+  // linker. To avoid such removal, we add Touch to all registrar classes and
+  // make the USE_PASS macro call this method. So, as long as the caller code
+  // calls USE_PASS, the global registrar variable won't be removed by the
+  // linker.
+  void Touch() {}
+};
 
-  virtual std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const = 0;
+class PassRegistry {
+ public:
+  static PassRegistry &Instance();
+
+  bool Has(const std::string &pass_type) const {
+    return map_.find(pass_type) != map_.end();
+  }
+
+  void Insert(const std::string &pass_type, const PassCreator &pass_creator) {
+    PADDLE_ENFORCE(!Has(pass_type), "Pass %s has been registered", pass_type);
+    map_.insert({pass_type, pass_creator});
+  }
+
+  std::unique_ptr<Pass> Get(const std::string &pass_type) const {
+    PADDLE_ENFORCE(Has(pass_type), "Pass %s has not been registered",
+                   pass_type);
+    return map_.at(pass_type)();
+  }
+
+ private:
+  PassRegistry() = default;
+  std::unordered_map<std::string, PassCreator> map_;
+
+  DISABLE_COPY_AND_ASSIGN(PassRegistry);
+};
+
+template <typename PassType>
+struct PassRegistrar : public Registrar {
+  explicit PassRegistrar(const char *pass_type) {
+    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
+                   "'%s' is registered more than once.", pass_type);
+    PassRegistry::Instance().Insert(
+        pass_type, [this]() -> std::unique_ptr<Pass> {
+          std::unique_ptr<Pass> pass(new PassType());
+          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
+          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          return pass;
+        });
+  }
+
+  PassRegistrar &RequirePassAttr(const std::string &attr) {
+    required_pass_attrs_.insert(attr);
+    return *this;
+  }
+
+  PassRegistrar &RequireGraphAttr(const std::string &attr) {
+    required_graph_attrs_.insert(attr);
+    return *this;
+  }
+
+ private:
+  std::unordered_set<std::string> required_pass_attrs_;
+  std::unordered_set<std::string> required_graph_attrs_;
+};
+
+#define STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(uniq_name, msg)                   \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+// Register a new pass that can be applied on the IR.
+#define REGISTER_PASS(pass_type, pass_class)                          \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __reg_pass__##pass_type,                                        \
+      "REGISTER_PASS must be called in global namespace");            \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      __pass_registrar_##pass_type##__(#pass_type);                   \
+  int TouchPassRegistrar_##pass_type() {                              \
+    __pass_registrar_##pass_type##__.Touch();                         \
+    return 0;                                                         \
+  }                                                                   \
+  static ::paddle::framework::ir::PassRegistrar<pass_class>           \
+      &__pass_tmp_registrar_##pass_type##__ __attribute__((unused)) = \
+          __pass_registrar_##pass_type##__
+
+#define USE_PASS(pass_type)                                           \
+  STATIC_ASSERT_PASS_GLOBAL_NAMESPACE(                                \
+      __use_pass_itself_##pass_type,                                  \
+      "USE_PASS must be called in global namespace");                 \
+  extern int TouchPassRegistrar_##pass_type();                        \
+  static int use_pass_itself_##pass_type##_ __attribute__((unused)) = \
+      TouchPassRegistrar_##pass_type()
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
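As a concrete illustration of the two macros, a hypothetical pass would be defined and linked in like this (the pass name `node_count_pass` and its body are invented for this sketch):

```cpp
// my_pass.cc -- a toy pass that records the node count as a graph attribute.
class NodeCountPass : public paddle::framework::ir::Pass {
 protected:
  std::unique_ptr<paddle::framework::ir::Graph> ApplyImpl(
      std::unique_ptr<paddle::framework::ir::Graph> graph) const override {
    // Graph::Set deduces the attribute type from the pointer argument.
    graph->Set("node_count", new int(static_cast<int>(graph->Nodes().size())));
    return graph;
  }
};

REGISTER_PASS(node_count_pass, NodeCountPass);
```

```cpp
// consumer.cc -- forces the registrar object to be linked in.
USE_PASS(node_count_pass);
```

After that, `PassRegistry::Instance().Get("node_count_pass")` hands out a fresh instance whose required-attribute sets were copied from the registrar.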
diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b5011412ed39e033a7a65921e9c64ce2d54c638
--- /dev/null
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/pass.h"
+#include <string>
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+void BuildCircleGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+
+  o2->outputs.push_back(v2);
+  o1->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o1);
+}
+
+class TestPass : public Pass {
+ protected:
+  std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
+    graph->Set("copy_test_pass_attr", new int);
+    graph->Set("copy_test_graph_attr", new int);
+
+    int test_pass_attr = this->Get<int>("test_pass_attr");
+    graph->Get<int>("copy_test_pass_attr") = test_pass_attr + 1;
+
+    int test_graph_attr = graph->Get<int>("test_graph_attr");
+    graph->Get<int>("copy_test_graph_attr") = test_graph_attr + 1;
+    return graph;
+  }
+};
+
+TEST(PassTest, TestPassAttrCheck) {
+  ProgramDesc prog;
+  auto pass = PassRegistry::Instance().Get("test_pass");
+  std::unique_ptr<Graph> graph(new Graph(prog));
+  std::string exception;
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_pass_attr not set") != exception.npos);
+
+  int val = 1;
+  graph.reset(new Graph(prog));
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("test_graph_attr not set") != exception.npos);
+
+  graph.reset(new Graph(prog));
+  graph->Set("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 1;
+  graph = pass->Apply(std::move(graph));
+  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
+  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
+
+  try {
+    graph = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
+
+  pass = PassRegistry::Instance().Get("test_pass");
+  pass->SetNotOwned<int>("test_pass_attr", &val);
+  graph.reset(new Graph(prog));
+  BuildCircleGraph(graph.get());
+  graph->Set("test_graph_attr", new int);
+  graph->Get<int>("test_graph_attr") = 2;
+  try {
+    auto tmp = pass->Apply(std::move(graph));
+  } catch (paddle::platform::EnforceNotMet e) {
+    exception = std::string(e.what());
+  }
+  ASSERT_TRUE(exception.find("shouldn't have a cycle") != exception.npos);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(test_pass, paddle::framework::ir::TestPass)
+    .RequirePassAttr("test_pass_attr")
+    .RequireGraphAttr("test_graph_attr");
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d1dc5fcd97b77fb7707c7d48f6eaeef140d3f306..7c1c29fd9a81c558f7fd05abf52cd0a6dd522190 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -679,6 +679,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     if (var == nullptr) continue;
     if (var->IsType<framework::LoDTensor>()) {
       CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+    } else if (var->IsType<framework::SelectedRows>()) {
+      CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
     }
   }
 }
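The `SelectedRows` branch above is needed because a `SelectedRows` variable is not a plain tensor: it pairs a list of row indices with a dense value tensor, and only the latter can be scanned for NaN/Inf. A deliberately simplified view of the layout, for orientation only (not the real class definition):

```cpp
// Simplified sketch of framework::SelectedRows.
class SelectedRows {
 public:
  const std::vector<int64_t>& rows() const { return rows_; }  // present rows
  const Tensor& value() const { return value_; }  // their dense payload
 private:
  std::vector<int64_t> rows_;  // indices into the conceptual full tensor
  Tensor value_;               // shape: [rows_.size(), row_width]
};
```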
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 02c836bea194553bb9c4bc5677cc408dd302e9ce..b5f01a9a2b76472063658f1a051a2ee3c65559b7 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -19,19 +19,80 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/ssa_graph_builder_factory.h"
+#include "paddle/fluid/framework/details/ssa_graph_checker.h"
+#include "paddle/fluid/framework/details/ssa_graph_printer.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 
+std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
+    const ProgramDesc &main_program,
+    const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes, const bool use_cuda,
+#ifdef PADDLE_WITH_CUDA
+    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
+#else
+    const BuildStrategy &strategy) {
+#endif
+  // Convert the program to graph.
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+
+  // Apply a graph viz pass to record a graph.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
+    const std::string graph_path = string::Sprintf(
+        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
+    viz_pass->Set("graph_viz_path", new std::string(graph_path));
+    graph = viz_pass->Apply(std::move(graph));
+  }
+
+  // Convert graph to run on multi-devices.
+  auto multi_device_pass =
+      ir::PassRegistry::Instance().Get("multi_device_pass");
+  multi_device_pass->SetNotOwned<const std::vector<platform::Place>>(
+      "places", &places);
+  multi_device_pass->SetNotOwned<const std::string>("loss_var_name",
+                                                    &loss_var_name);
+  multi_device_pass->SetNotOwned<const std::unordered_set<std::string>>(
+      "params", &param_names);
+  multi_device_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                             &local_scopes);
+  multi_device_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
+
+#ifdef PADDLE_WITH_CUDA
+  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+  multi_device_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+  graph = multi_device_pass->Apply(std::move(graph));
+
+  // Apply a graph print pass to record a graph with device info.
+  if (!strategy.debug_graphviz_path_.empty()) {
+    auto multi_device_print_pass =
+        ir::PassRegistry::Instance().Get("multi_device_print_pass");
+    multi_device_print_pass->SetNotOwned<const std::string>(
+        "debug_graphviz_path", &strategy.debug_graphviz_path_);
+    multi_device_print_pass->Set<details::SSAGraphPrinter>(
+        "graph_printer", new details::GraphvizSSAGraphPrinter);
+    graph = multi_device_print_pass->Apply(std::move(graph));
+  }
+
+  // Verify that the graph is correct for multi-device executor.
+  auto multi_device_check_pass =
+      ir::PassRegistry::Instance().Get("multi_device_check_pass");
+  graph = multi_device_check_pass->Apply(std::move(graph));
+  return graph;
+}
+
 class ParallelExecutorPrivate {
  public:
   explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -119,21 +180,19 @@ ParallelExecutor::ParallelExecutor(
     var_infos.back().persistable_ = var->Persistable();
   }
 
-  // Step 3. Convert main_program to SSA form and dependency graph.
Also, insert - // ncclOp - details::SSAGraphBuilderFactory builder_factory( - member_->places_, loss_var_name, params, member_->local_scopes_, - build_strategy); - if (member_->use_cuda_) { +// Step 3. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp #ifdef PADDLE_WITH_CUDA - builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get()); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy, + member_->nccl_ctxs_.get()); #else - PADDLE_THROW("Not compiled with CUDA."); + std::unique_ptr graph = ApplyParallelExecutorPass( + main_program, member_->places_, loss_var_name, params, + member_->local_scopes_, member_->use_cuda_, build_strategy); #endif - } - builder_ = builder_factory.Create(); - std::unique_ptr graph(new ir::Graph(main_program)); - graph = builder_->Apply(std::move(graph)); + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( exec_strategy, member_->local_scopes_, places, std::move(graph))); member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -146,11 +205,18 @@ void ParallelExecutor::BCastParamsToDevices( // the initializing bcast, all vars would be bcast from device(0), // otherwise // bcast from the specified device. - bool initializing = builder_.get() == nullptr ? true : false; - + bool initializing = member_->executor_ ? false : true; for (auto &var : vars) { - int var_dev_id = - builder_.get() == nullptr ? -1 : builder_->GetVarDeviceID(var); + int var_dev_id = -1; + if (member_->executor_) { + auto &sharded_var_device = + member_->executor_->Graph().Get( + details::kShardedVarDevice); + if (sharded_var_device.find(var) != sharded_var_device.end()) { + var_dev_id = sharded_var_device.at(var); + } + } + if (!initializing && var_dev_id == -1) continue; framework::Variable *main_var = nullptr; @@ -286,3 +352,8 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle + +USE_PASS(graph_viz_pass); +USE_PASS(multi_device_pass); +USE_PASS(multi_device_check_pass); +USE_PASS(multi_device_print_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ffb9934a2d702b2bf6db7ad75a6bf9867e1e9901..d624956acde86cefc4ec1dec80df3738bcf1d8be 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -70,7 +70,6 @@ class ParallelExecutor { private: ParallelExecutorPrivate *member_; - std::unique_ptr builder_; }; } // namespace framework diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 16c9c81258a9fdb7730b9b3e34be990798c91639..ba7645aa02413f28a648f35e381da7824604a455 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -14,8 +14,15 @@ cc_library(paddle_fluid_api get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +# paddle_fluid_origin exclude inference api interface +cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) + +if(NOT APPLE) + add_subdirectory(api) +endif() + # Create static library -cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api) +cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api) if(NOT APPLE) # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") @@ -24,7 +31,7 @@ endif() # Create shared library cc_library(paddle_fluid_shared SHARED - SRCS io.cc + SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc DEPS ${fluid_modules} paddle_fluid_api) set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid) @@ -32,12 +39,21 @@ if(NOT APPLE) # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac. set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map") set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") + # check symbol hidden + FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake + "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" + " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n" + "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" + " message(FATAL_ERROR \"Check symbol failed.\")\n" + "endif()\n") + add_custom_command( + OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" + DEPENDS paddle_fluid_shared) + add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() if(WITH_TESTING) # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book add_subdirectory(tests/book) endif() -if(NOT APPLE) - add_subdirectory(api) -endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 7e4b3e9a2dcae6b34d1af089bc7da55e09315c58..3e60a61793339990648737c3d549d46cc5f5a887 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -42,35 +42,8 @@ function(inference_api_test TARGET_NAME) endif(WITH_TESTING) endfunction(inference_api_test) -cc_library(paddle_inference_api - SRCS api.cc api_impl.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -if(NOT APPLE) - set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym") - set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() - -# Here the shared library doesn't depend on other fluid libraries, or double free will occur. 
-cc_library(paddle_inference_api_shared SHARED - SRCS api.cc api_impl.cc) -add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) -set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api) +cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor) -if(NOT APPLE) - set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map") - set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}") - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake - "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh" - " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n" - "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n" - " message(FATAL_ERROR \"Check symbol failed.\")\n" - "endif()\n") - add_custom_command( - OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol" - COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake" - DEPENDS paddle_inference_api_shared) - add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") -endif() cc_test(test_paddle_inference_api SRCS api_tester.cc diff --git a/paddle/fluid/inference/api/api.map b/paddle/fluid/inference/api/api.map deleted file mode 100644 index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api.map +++ /dev/null @@ -1,6 +0,0 @@ -{ - global: - *paddle*; - local: - *; -}; diff --git a/paddle/fluid/inference/api/api.sym b/paddle/fluid/inference/api/api.sym deleted file mode 100644 index ef2a04d788aa86b7f6a61c4af479d70d1137f374..0000000000000000000000000000000000000000 --- a/paddle/fluid/inference/api/api.sym +++ /dev/null @@ -1 +0,0 @@ -*paddle* diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 7f9bb4b33e97b5ea37e9216b00ce0c82ca3ce230..ba73a6eaa6fc885b6b56c2d6330394e2f9c384bf 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -55,11 +55,9 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a) else() set(DEPS - ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so) endif() set(EXTERNAL_LIB "-lrt -ldl -lpthread") diff --git a/paddle/fluid/inference/api/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh similarity index 64% rename from paddle/fluid/inference/api/check_symbol.sh rename to paddle/fluid/inference/check_symbol.sh index 6547ca1413649968e8a0be146915e07192a99898..12b7b3e7e5982f193e48596b867953fc93841b61 100755 --- a/paddle/fluid/inference/api/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -3,8 +3,8 @@ lib=$1 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi -num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l) -num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l) +num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l) if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi diff --git 
a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 748f5a084e8c880df215a60fe51c835ba5cd3110..3864f337bdadc61e7531304e2cf2ee52a25253f2 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
-  SRCS mul_op.cc conv2d_op.cc fc_op.cc
-  DEPS tensorrt_engine mul_op)
+  SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
+  DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
         ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
@@ -13,3 +13,6 @@ nv_test(test_trt_fc_op SRCS test_fc_op.cc fc_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine mul_op SERIAL)
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine activation_op SERIAL)
+
+nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3c342957360ad4192d838147bf37e84d233c2629..514eb659a8da73b6e56b5d17148ec0cb2aeaa135 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(mul);
 REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
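Before the new pool2d converter that follows, note the shape arithmetic its tests rely on. A small illustrative helper (not part of the patch) makes the expected dimensions explicit:

```cpp
// Standard pooling output size along one spatial axis, for window k,
// stride s and padding p:  floor((in + 2*p - k) / s) + 1.
// The pool2d test below uses in=4, k=2, s=2, p=0 -> (4 + 0 - 2)/2 + 1 = 2,
// which is why a 3x4x4 input is paired with a 3x2x2 output.
int PooledDim(int in, int k, int s, int p) { return (in + 2 * p - k) / s + 1; }
```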
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..11cad95361867476c6f775af778015da37f1cfb1
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+/*
+ * Pool2dOp, IPoolingLayer in TRT. This layer doesn't have weights.
+ */
+class Pool2dOpConverter : public OpConverter {
+ public:
+  void operator()(const framework::proto::OpDesc& op,
+                  const framework::Scope& scope, bool test_mode) override {
+    VLOG(4)
+        << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
+    framework::OpDesc op_desc(op, nullptr);
+    // Declare inputs
+    PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+    PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+
+    std::string pool_type =
+        boost::get<std::string>(op_desc.GetAttr("pooling_type"));
+    std::vector<int> ksize =
+        boost::get<std::vector<int>>(op_desc.GetAttr("ksize"));
+    std::vector<int> strides =
+        boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+    std::vector<int> paddings =
+        boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+
+    const nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    const nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    const nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
+
+    nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
+    if (pool_type == "max") {
+      nv_pool_type = nvinfer1::PoolingType::kMAX;
+    } else if (pool_type == "avg") {
+      nv_pool_type = nvinfer1::PoolingType::kAVERAGE;
+    } else {
+      PADDLE_THROW("TensorRT unsupported pooling type!");
+    }
+
+    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
+                                       *const_cast<nvinfer1::ITensor*>(input1),
+                                       nv_pool_type, nv_ksize);
+    PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
+    layer->setStride(nv_strides);
+    layer->setPadding(nv_paddings);
+
+    auto output_name = op_desc.Output("Out")[0];
+    engine_->SetITensor(output_name, layer->getOutput(0));
+    if (test_mode) {
+      engine_->DeclareOutput(output_name);
+    }
+  }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
+REGISTER_TRT_OP_CONVERTER(pool2d, Pool2dOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
index 7dabfd9f6a9a8cfbdd1d9a66541180d3499b7bdc..e82762ea03ecd00bce7cfb83b130a3436ccbfed3 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_activation_op.cc
@@ -37,7 +37,7 @@ TEST(ReluOpConverter, main) {
   validator.SetOp(*desc.Proto());
   LOG(INFO) << "execute";
 
-  validator.Execute(1);
+  validator.Execute(5);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
index 081f4d605975f1408d4d8a8ed3108c04d837a4de..1ae2668e733aad23241c63b9985e708396d0b1bc 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_fc_op.cc
@@ -24,9 +24,8 @@ TEST(fc_op, test) {
   std::unordered_set<std::string> parameters({"mul-Y"});
   framework::Scope scope;
   TRTConvertValidation validator(10, parameters, scope, 1000);
-  validator.DeclInputVar("mul-X", nvinfer1::Dims4(1, 10, 1, 1));
+  validator.DeclInputVar("mul-X", nvinfer1::Dims3(10, 1, 1));
   validator.DeclParamVar("mul-Y", nvinfer1::Dims2(10, 2));
-  // validator.DeclParamVar("mul-Y", nvinfer1::Dims2(8, 2));
   validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(1, 2));
 
   // Prepare Op description
@@ -38,7 +37,7 @@ TEST(fc_op, test) {
 
   validator.SetOp(*desc.Proto());
 
-  validator.Execute(1);
+  validator.Execute(10);
 }
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc
index
674f37f2fdddf013a8f6f4671debbc19c3322423..3d34cd7d5d0deca4d83a3f5b5ed0fb396c6acd56 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_mul_op.cc @@ -23,7 +23,7 @@ namespace tensorrt { TEST(MulOpConverter, main) { framework::Scope scope; std::unordered_set parameters; - TRTConvertValidation validator(10, parameters, scope, 1000); + TRTConvertValidation validator(10, parameters, scope, 1000, false); validator.DeclInputVar("mul-X", nvinfer1::Dims2(10, 6)); validator.DeclInputVar("mul-Y", nvinfer1::Dims2(6, 10)); validator.DeclOutputVar("mul-Out", nvinfer1::Dims2(10, 10)); @@ -39,7 +39,7 @@ TEST(MulOpConverter, main) { validator.SetOp(*desc.Proto()); LOG(INFO) << "execute"; - validator.Execute(1); + validator.Execute(2); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c5dddbc8cd37b9fb1ba39382af2da5ad045f3af2 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(Pool2dOpConverter, main) { + framework::Scope scope; + std::unordered_set parameters; + TRTConvertValidation validator(5, parameters, scope, 1 << 15); + + // The ITensor's Dims should not contain the batch size. + // So, the ITensor's Dims of input and output should be C * H * W. 
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 4, 4));
+  validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 2, 2));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("pool2d");
+  desc.SetInput("X", {"pool2d-X"});
+  desc.SetOutput("Out", {"pool2d-Out"});
+
+  std::vector<int> ksize({2, 2});
+  std::vector<int> strides({2, 2});
+  std::vector<int> paddings({0, 0});
+  std::string pooling_t = "max";
+
+  desc.SetAttr("pooling_type", pooling_t);
+  desc.SetAttr("ksize", ksize);
+  desc.SetAttr("strides", strides);
+  desc.SetAttr("paddings", paddings);
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(3);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(pool2d);
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index f14885b238134cdf38a278cd8a0734947bcacfe0..39529cc2c799212f91107b1b86dd2c8c3642b6da 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -63,13 +63,16 @@ class TRTConvertValidation {
  public:
   TRTConvertValidation() = delete;
 
-  TRTConvertValidation(int batch_size,
+  TRTConvertValidation(int max_batch_size,
                        const std::unordered_set<std::string>& parameters,
                        framework::Scope& scope,  // NOLINT
-                       int workspace_size = 1 << 10)
-      : parameters_(parameters), scope_(scope) {
+                       int workspace_size = 1 << 10, bool if_add_batch = true)
+      : parameters_(parameters),
+        scope_(scope),
+        if_add_batch_(if_add_batch),
+        max_batch_size_(max_batch_size) {
     // create engine.
-    engine_.reset(new TensorRTEngine(batch_size, workspace_size, &stream_));
+    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, &stream_));
     engine_->InitNetwork();
 
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
@@ -84,7 +87,7 @@ class TRTConvertValidation {
 
   // Declare a parameter variable in the scope.
   void DeclParamVar(const std::string& name, const nvinfer1::Dims& dims) {
-    DeclVar(name, dims);
+    DeclVar(name, dims, true);
   }
 
   void DeclOutputVar(const std::string& name, const nvinfer1::Dims& dims) {
@@ -92,12 +95,18 @@ class TRTConvertValidation {
   }
 
   // Declare a variable in a fluid Scope.
-  void DeclVar(const std::string& name, const nvinfer1::Dims& dims) {
+  void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
+               bool is_param = false) {
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
 
     // Init Fluid tensor.
    std::vector<int64_t> dim_vec(dims.d, dims.d + dims.nbDims);
+    // There is no batch size in an ITensor's shape, but we should add it to
+    // the tensor's shape in Fluid. If the variable is not a parameter and the
+    // if_add_batch_ flag is true, add the max batch size to dim_vec.
+    if (is_param != true && if_add_batch_ == true)
+      dim_vec.insert(dim_vec.begin(), max_batch_size_);
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
@@ -131,6 +140,7 @@ class TRTConvertValidation {
 
   void Execute(int batch_size) {
     // Execute Fluid Op
+    PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
     platform::CPUPlace place;
     platform::CPUDeviceContext ctx(place);
     op_->Run(scope_, place);
@@ -149,9 +159,15 @@ class TRTConvertValidation {
       auto* var = scope_.FindVar(output);
       auto tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+
+      size_t fluid_out_size = fluid_out.size();
+      if (if_add_batch_ == true) {
+        fluid_out_size =
+            batch_size * (framework::product(tensor->dims()) / max_batch_size_);
+      }
       // Compare two output
       ASSERT_FALSE(fluid_out.empty());
-      for (size_t i = 0; i < fluid_out.size(); i++) {
+      for (size_t i = 0; i < fluid_out_size; i++) {
         // Loose the threshold for CI in different machine model.
         EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
       }
@@ -167,6 +183,12 @@ class TRTConvertValidation {
   std::unique_ptr<framework::OpDesc> op_desc_;
   const std::unordered_set<std::string>& parameters_;
   framework::Scope& scope_;
+  // The ITensor of TRT does not contain the batch size,
+  // but, in most cases, we need to set the batch size for
+  // Fluid's tensor shape. This variable indicates
+  // whether to add the batch size to the tensor shape of Fluid.
+  bool if_add_batch_;
+  int max_batch_size_;
 };
 
 }  // namespace tensorrt
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index f8732e51b66bdc78aa35d06ba9651f1942a74b01..dc03702990587bf5e65d28da662d10df4d882110 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -113,7 +113,7 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   ASSERT_EQ(y_cpu[1], 14.5);
 }
 
-TEST_F(TensorRTEngineTest, test_conv2d_temp) {
+TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
@@ -146,6 +146,37 @@
   ASSERT_EQ(y_cpu[1], 6.0);
 }
 
+TEST_F(TensorRTEngineTest, test_pool2d) {
+  // Pooling has no weights; only an input tensor is declared.
+ auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + nvinfer1::Dims3{1, 2, 2}); + + nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; + auto* pool_layer = + TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), + pool_t, nvinfer1::DimsHW{2, 2}); + + PADDLE_ENFORCE(pool_layer != nullptr); + pool_layer->setStride(nvinfer1::DimsHW{1, 1}); + pool_layer->setPadding(nvinfer1::DimsHW{0, 0}); + + engine_->DeclareOutput(pool_layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 2); + + float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), + 8 * sizeof(float)); + engine_->Execute(2); + + LOG(INFO) << "to get output"; + float* y_cpu = new float[2]; + engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + + ASSERT_EQ(y_cpu[0], 2.0); + ASSERT_EQ(y_cpu[1], 5.0); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt index 2fa5a9540ba1311c7f87e6675a53044b23dd8276..017fc4cd7b11c150cb941fffca2606a4d707330f 100644 --- a/paddle/fluid/inference/tests/book/CMakeLists.txt +++ b/paddle/fluid/inference/tests/book/CMakeLists.txt @@ -17,7 +17,7 @@ function(inference_test TARGET_NAME) string(REGEX REPLACE "^_$" "" arg "${arg}") cc_test(test_inference_${TARGET_NAME}${arg} SRCS test_inference_${TARGET_NAME}.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model) set_tests_properties(test_inference_${TARGET_NAME}${arg} PROPERTIES DEPENDS test_${TARGET_NAME}) @@ -43,6 +43,6 @@ inference_test(word2vec) # TODO(TJ): clean me up cc_test(test_inference_nlp SRCS test_inference_nlp.cc - DEPS paddle_fluid + DEPS paddle_fluid_origin ARGS --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model) diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index 5cc1db12bb71e428d493e7c6f718b1c6ed431858..e2a3e9d46ef9f303d191d59253ffbe9f4826184b 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -20,9 +20,6 @@ limitations under the License. 
*/
 #include "gtest/gtest.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#ifdef PADDLE_WITH_MKLML
-#include <omp.h>
-#endif
 
 DEFINE_string(model_path, "", "Directory of the inference model.");
 DEFINE_string(data_file, "", "File of input index data.");
@@ -30,6 +27,7 @@ DEFINE_int32(repeat, 100, "Running the inference program repeat times");
 DEFINE_bool(prepare_vars, true, "Prepare variables before executor");
 DEFINE_int32(num_threads, 1, "Number of threads should be used");
 DECLARE_bool(use_mkldnn);
+DECLARE_int32(paddle_num_threads);
 
 inline double GetCurrentMs() {
   struct timeval time;
@@ -160,12 +158,7 @@ TEST(inference, nlp) {
   std::unique_ptr<paddle::framework::Scope> scope(
       new paddle::framework::Scope());
-#ifdef PADDLE_WITH_MKLML
-  // only use 1 thread number per std::thread
-  omp_set_dynamic(0);
-  omp_set_num_threads(1);
-  paddle::platform::SetNumThreads(1);
-#endif
+  paddle::platform::SetNumThreads(FLAGS_paddle_num_threads);
 
   double start_ms = 0, stop_ms = 0;
   if (FLAGS_num_threads > 1) {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9b56ad4c55e35d497aa7abe4e1da3867a2084b88..4c3b8ec78190723598a56f7633764f10dd5047f3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -270,6 +270,9 @@ op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
 op_library(squeeze_op DEPS reshape_op)
+op_library(extract_rows_op DEPS memory)
+op_library(flatten_op DEPS reshape_op)
+
 
 if (WITH_GPU)
   op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 1612927055dd4ec5ee2220bc2b285e8d9b640ea8..da5d20505e9b06c0717af8d79d5456a9ade1e89c 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -17,9 +17,9 @@ if(WITH_GRPC)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(grpc_serde_test SRCS grpc_serde_test.cc
-    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
-  cc_test(rpc_server_test SRCS rpc_server_test.cc
-    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op SERIAL)
+    DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL)
+  cc_test(rpc_server_test SRCS rpc_server_test.cc
+    DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL)
   return()
 endif()
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 265f964ddc682868c64669744b130aebbbf86692..b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -49,6 +49,7 @@ void GRPCClient::SendComplete() {
 }
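The grpc_client.cc change that follows is about shutdown ordering: the destructor must stop the completion-queue polling loop before tearing the client down. A condensed view of how the pieces added in this hunk fit together (simplified from the patch, not the full class):

```cpp
// Destructor: raise the flag first, then unblock the polling thread.
GRPCClient::~GRPCClient() {
  stopped_ = true;   // Proceed() checks this before each completion
  Wait();            // let in-flight RPCs finish
  cq_.Shutdown();    // wakes cq_.Next() so the loop below can exit
}

// Polling loop: re-checks stopped_ on every iteration.
void GRPCClient::Proceed() {
  void* tag = nullptr;
  bool ok = false;
  while (!stopped_ && cq_.Next(&tag, &ok)) {
    // ... dispatch the completed request associated with `tag` ...
  }
}
```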
GRPCClient::~GRPCClient() { + stopped_ = true; Wait(); cq_.Shutdown(); { @@ -275,7 +276,7 @@ void GRPCClient::Proceed() { void* tag = nullptr; bool ok = false; - while (cq_.Next(&tag, &ok)) { + while (!stopped_ && cq_.Next(&tag, &ok)) { BaseProcessor* c = static_cast(tag); GPR_ASSERT(ok); PADDLE_ENFORCE(c); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index 8351d825f817437e1b3691e916952dd9a86af491..0c95ffeb5ce7e1586c5968fb122acd12c0c0196e 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor { class GRPCClient : public RPCClient { public: - GRPCClient() : ok_(true), completed_(false) {} + GRPCClient() : ok_(true), completed_(false), stopped_(false) {} virtual ~GRPCClient(); bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, @@ -237,6 +237,8 @@ class GRPCClient : public RPCClient { // mutex for sending complete message only once std::mutex completed_mutex_; bool completed_; + + volatile bool stopped_; }; } // namespace distributed diff --git a/paddle/fluid/operators/distributed/rpc_server_test.cc b/paddle/fluid/operators/distributed/rpc_server_test.cc index 9f2360ec70d2ce5d4e16435595e109c1bf04fd13..b50830c362d3f6ecf38affbfa6a1ffe2ed77e125 100644 --- a/paddle/fluid/operators/distributed/rpc_server_test.cc +++ b/paddle/fluid/operators/distributed/rpc_server_test.cc @@ -30,7 +30,7 @@ namespace framework = paddle::framework; namespace platform = paddle::platform; namespace distributed = paddle::operators::distributed; -USE_OP(lookup_table); +USE_NO_KERNEL_OP(lookup_sparse_table); std::unique_ptr g_rpc_service; std::unique_ptr g_req_handler; @@ -42,13 +42,13 @@ framework::BlockDesc* AppendPrefetchBlcok(framework::ProgramDesc* program) { framework::VariableNameMap input({{"W", {"w"}}, {"Ids", {"ids"}}}); framework::VariableNameMap output({{"Output", {"out"}}}); auto op = block->AppendOp(); - op->SetType("lookup_table"); + op->SetType("lookup_sparse_table"); op->SetInput("W", {"w"}); op->SetInput("Ids", {"ids"}); op->SetOutput("Out", {"out"}); auto& out = *root_block->Var("out"); - out.SetType(framework::proto::VarType::SELECTED_ROWS); + out.SetType(framework::proto::VarType::LOD_TENSOR); out.SetShape({10, 10}); return block; @@ -59,20 +59,19 @@ void CreateVarsOnScope(framework::Scope* scope, platform::CPUPlace* place) { w_var->GetMutable(); auto out_var = scope->Var("out"); - out_var->GetMutable(); + out_var->GetMutable(); auto ids_var = scope->Var("ids"); - ids_var->GetMutable(); + ids_var->GetMutable(); } void InitTensorsOnClient(framework::Scope* scope, platform::CPUPlace* place, int64_t rows_numel) { CreateVarsOnScope(scope, place); - auto ids_var = scope->Var("ids")->GetMutable(); - auto rows = ids_var->mutable_rows(); - for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i * 2); - ids_var->mutable_value()->Resize({rows_numel, 1}); - ids_var->mutable_value()->mutable_data(*place); + auto ids_var = scope->Var("ids")->GetMutable(); + int64_t* ids_ptr = + ids_var->mutable_data(framework::DDim({rows_numel, 1}), *place); + for (int64_t i = 0; i < rows_numel; ++i) ids_ptr[i] = i * 2; } void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place, @@ -148,11 +147,11 @@ TEST(PREFETCH, CPU) { client->AsyncPrefetchVar(ep, ctx, scope, in_var_name, out_var_name); client->Wait(); auto var = scope.Var(out_var_name); - auto value = var->GetMutable()->value(); 
-  auto ptr = value.mutable_data<float>(place);
+  auto value = var->GetMutable<framework::LoDTensor>();
+  auto ptr = value->mutable_data<float>(place);
 
   for (int64_t i = 0; i < rows_numel; ++i) {
-    EXPECT_EQ(ptr[0 + i * value.dims()[1]], static_cast<float>(i * 2));
+    EXPECT_EQ(ptr[0 + i * value->dims()[1]], static_cast<float>(i * 2));
   }
 }
diff --git a/paddle/fluid/operators/extract_rows_op.cc b/paddle/fluid/operators/extract_rows_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a297d03cfb041e584159a5fc5ba214f8ac404b4
--- /dev/null
+++ b/paddle/fluid/operators/extract_rows_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ExtractRowsOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ExtractRowsOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("X")[0],
+                      framework::proto::VarType::SELECTED_ROWS,
+                      "The type of input(X) must be SelectedRows.");
+    auto in_dims = ctx->GetInputDim("X");
+
+    ctx->SetOutputDim(
+        "Out", framework::make_ddim(std::vector<int64_t>{in_dims[0], 1}));
+  }
+};
+
+class ExtractRowsOp : public framework::OperatorBase {
+ public:
+  ExtractRowsOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &in = scope.FindVar(Input("X"))->Get<framework::SelectedRows>();
+    auto out = scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    auto in_rows = in.rows();
+    auto out_dim = framework::make_ddim(
+        std::vector<int64_t>{static_cast<int64_t>(in_rows.size()), 1});
+    auto dst_ptr = out->mutable_data<int64_t>(out_dim, in.place());
+
+    if (paddle::platform::is_gpu_place(in.place())) {
+#ifdef PADDLE_WITH_CUDA
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto *dev_ctx = pool.Get(in.place());
+      auto src_ptr = in_rows.Data(in.place());
+      auto stream =
+          reinterpret_cast<const platform::CUDADeviceContext &>(*dev_ctx)
+              .stream();
+      memory::Copy(boost::get<platform::CUDAPlace>(out->place()), dst_ptr,
+                   boost::get<platform::CUDAPlace>(in.place()), src_ptr,
+                   in_rows.size() * sizeof(int64_t), stream);
+#else
+      PADDLE_THROW("Not compiled with CUDA.");
+#endif
+    } else {
+      memory::Copy(platform::CPUPlace(), dst_ptr, platform::CPUPlace(),
+                   in_rows.data(), in_rows.size() * sizeof(int64_t));
+    }
+  }
+};
+
+class ExtractRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(SelectedRows). The input tensor of extract_rows operator,"
+             " and its type is SelectedRows.");
+    AddOutput("Out", "(Tensor). The rows of input(X).");
+
+    AddComment(R"DOC(
+    ExtractRows Operator.
+ +The function of extract_rows_op is extracting the rows from the input(X) +whose type is SelectedRows. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(extract_rows, ops::ExtractRowsOp, ops::ExtractRowsOpMaker, + ops::ExtractRowsOpInferShape); diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fdda01381e117cecffb2a05f8399f3ad82a46339 --- /dev/null +++ b/paddle/fluid/operators/flatten_op.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FlattenOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input (X) of Flatten op should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output (Output) of Flatten op should not be null."); + const auto &axis = ctx->Attrs().Get("axis"); + const auto &in_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0."); + PADDLE_ENFORCE( + axis <= in_dims.size(), + "The axis should be less than or equal to input tensor's rank."); + + const auto &out_dims = GetOutputShape(axis, in_dims); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + if (in_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", "Out"); + } + } + + static std::vector GetOutputShape(const int axis, + const framework::DDim &in_dims) { + int64_t outer = 1, inner = 1; + for (int i = 0; i < in_dims.size(); ++i) { + if (i < axis) { + outer *= in_dims[i]; + } else { + inner *= in_dims[i]; + } + } + std::vector out_shape(2); + out_shape[0] = outer; + out_shape[1] = inner; + return out_shape; + } +}; + +class FlattenOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto &axis = Attr("axis"); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims); + + framework::AttributeMap attrs; + attrs["shape"] = out_dims; + attrs["inplace"] = false; + // Invoke Reshape Op + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {Input("X")}}, {"Shape", {}}}, + {{"Out", {Output("Out")}}}, attrs); + reshape_op->Run(scope, place); + } +}; + +class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) A tensor of rank >= axis."); + AddOutput("Out", + "A 2D tensor is reshaped input tensor. 
The input dimensions" + "up to axis are flattened to the outer dimension of the output" + "and the remaining input dimensions are flattened into the inner" + "dimension of the output."); + AddAttr("axis", + "(int)" + "Indicate up to which input dimensions (exclusive) should be" + "flattened to the outer dimension of the output. The value" + "for axis must be in the range [0, R], where R is the rank of" + "the input tensor. When axis = 0, the shape of the output" + "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" + "input tensor is (d_0, d_1, ... d_n).") + .SetDefault(1); + AddComment(R"DOC( +Flatten Operator + +Flattens the input tensor into a 2D matrix. + +Examples: +Case 1: + Given + X.shape = (3, 100, 100, 4) + and + axis = 2 + We get: + Out.shape = (3 * 100, 4 * 100) + +Case 2: + Given + X.shape = (3, 100, 100, 4) + and + axis = 0 + We get: + Out.shape = (1, 3 * 100 * 100 * 4) +)DOC"); + } +}; + +class FlattenGradInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim(framework::GradVarName("X"), + context->GetInputDim("X")); + context->ShareLoD("X", framework::GradVarName("X")); + } +}; + +class FlattenGradOp : public framework::OperatorBase { + public: + using OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope &scope, + const platform::Place &place) const override { + auto dx_name = Output(framework::GradVarName("X")); + auto dout_name = Input(framework::GradVarName("Out")); + auto in_dims = + scope.FindVar(Input("X"))->Get().dims(); + framework::AttributeMap attrs; + attrs["shape"] = framework::vectorize2int(in_dims); + attrs["inplace"] = false; + + auto reshape_op = framework::OpRegistry::CreateOp( + "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}}, + attrs); + reshape_op->Run(scope, place); + } +}; + +} // namespace operators +} // namespace paddle + +USE_OP(reshape); + +namespace ops = paddle::operators; +REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker, + ops::FlattenOpInferShape, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape); diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index bda499432214b8841c8dfc406ee45ca0367920e7..3e8f3ec5c5cd683343bcbdfc2388bd37c25e00f9 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -33,19 +33,15 @@ class LookupTableOp : public framework::OperatorWithKernel { auto table_dims = ctx->GetInputDim("W"); auto ids_dims = ctx->GetInputDim("Ids"); - auto ids_var_type = ctx->GetInputsVarType("Ids").front(); - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W - // and it must be a column vector with rank = 2 while the 2nd dimension - // size must be 1, when Ids's type is SelectedRows, the rows of Ids - // contains the ids to be looked up in W; - if (ids_var_type == framework::proto::VarType::LOD_TENSOR) { - PADDLE_ENFORCE_EQ(ids_dims.size(), 2); - PADDLE_ENFORCE_EQ(ids_dims[1], 1); - } + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]}); - ctx->ShareLoD("Ids", /*->*/ "Out"); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } } protected: @@ -62,17 +58,12 @@ 
class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor) The input represents embedding tensors, " "which is a learnable parameter."); - AddInput( - "Ids", - "(Tensor or SelectedRows) Ids's type can be Tensor or " - "SelectedRows, when Ids's type is Tensor, this tensor contains " - "the ids to be looked up in W and it must be a column vector with " - "rank = 2 while the 2nd dimension size must be 1; when Ids's type is " - "SelectedRows, the rows of Ids contains the ids to be looked up " - "in W."); - AddOutput("Out", - "(Tensor or SelectedRows) The lookup results, which have the " - "same type as W."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "Ids must be a column vector with rank = 2. " + "The 2nd dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); AddAttr("is_sparse", "(boolean, default false) " "Sparse update.") @@ -90,15 +81,10 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker { Lookup Table Operator. This operator is used to perform lookups on the parameter W, -then concatenated into a dense or sparse tensor. - -The type of Ids(Input) is SelectedRows, Tensor or LoDTensor, when Ids's -type is SelectedRows, the rows of Ids contains the ids to be looked up in W; -when Ids's type is Tensor, this tensor contains the ids to be looked up in W -and it must be a column vector with rank = 2 while the 2nd dimension size must be 1, -at this time, Ids can carry the LoD (Level of Details) information, or not, and -the output only shares the LoD information with input Ids. +then concatenated into a dense tensor. +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. 
)DOC"); } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 77722c50d39003d9342afb04a61ae3aaf6b21100..27483372b93a850d313445386c7973838c4a0710 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -23,7 +23,7 @@ namespace operators { template -__global__ void LookupTable(T* output, const T* table, const int64_t* ids, +__global__ void LookupTable(T *output, const T *table, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; @@ -33,8 +33,8 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, int64_t id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - T* out = output + idy * D; - const T* tab = table + id * D; + T *out = output + idy * D; + const T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { if (PaddingFlag) { if (id == padding_idx) @@ -50,7 +50,7 @@ __global__ void LookupTable(T* output, const T* table, const int64_t* ids, } template -__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, +__global__ void LookupTableGrad(T *table, const T *output, const int64_t *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; @@ -60,8 +60,8 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, int id = ids[idy]; PADDLE_ASSERT(id >= 0); PADDLE_ASSERT(id < N); - const T* out = output + idy * D; - T* tab = table + id * D; + const T *out = output + idy * D; + T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } @@ -72,36 +72,19 @@ __global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids, template class LookupTableCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* table_t = context.Input("W"); + void Compute(const framework::ExecutionContext &context) const override { + auto *table_t = context.Input("W"); + auto *ids_t = context.Input("Ids"); + auto *output_t = context.Output("Out"); int64_t padding_idx = context.Attr("padding_idx"); - auto* ids_var = context.InputVar("Ids"); - Tensor* output_t = context.Output("Out"); - - int64_t* ids; - int64_t K; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. 
- if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - K = ids_t->numel(); - } else if (ids_var->IsType()) { - auto* ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().CUDAData(context.GetPlace())); - K = ids_t->rows().size(); - output_t->Resize({K, table_t->dims()[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } size_t N = table_t->dims()[0]; size_t D = table_t->dims()[1]; - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); + size_t K = ids_t->numel(); + + auto *ids = ids_t->data(); + auto *table = table_t->data(); + auto *output = output_t->mutable_data(context.GetPlace()); dim3 threads(128, 8); dim3 grids(8, 1); @@ -122,19 +105,19 @@ class LookupTableCUDAKernel : public framework::OpKernel { template class LookupTableGradCUDAKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = + void Compute(const framework::ExecutionContext &context) const override { + auto &dev_ctx = context.template device_context(); bool is_sparse = context.Attr("is_sparse"); // Since paddings are not trainable and fixed in forward, the gradient of // paddings makes no sense and we don't deal with it in backward. if (is_sparse) { - auto* ids = context.Input("Ids"); - auto* table = context.Input("W"); - auto* d_output = context.Input(framework::GradVarName("Out")); - auto* d_table = context.Output(framework::GradVarName("W")); + auto *ids = context.Input("Ids"); + auto *table = context.Input("W"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); - auto* ids_data = ids->data(); + auto *ids_data = ids->data(); auto ids_dim = ids->dims(); auto stream = dev_ctx.stream(); @@ -150,12 +133,12 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { d_table->set_rows(new_rows); - auto* d_table_value = d_table->mutable_value(); + auto *d_table_value = d_table->mutable_value(); d_table_value->Resize({ids_dim[0], table->dims()[1]}); d_table_value->mutable_data(context.GetPlace()); - auto* d_table_data = d_table_value->data(); - auto* d_output_data = d_output->data(); + auto *d_table_data = d_table_value->data(); + auto *d_output_data = d_output->data(); PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims()); memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data, d_output->numel() * sizeof(T), stream); @@ -168,9 +151,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { int N = d_table_t->dims()[0]; int D = d_table_t->dims()[1]; int K = ids_t->numel(); - const int64_t* ids = ids_t->data(); - const T* d_output = d_output_t->data(); - T* d_table = d_table_t->mutable_data(context.GetPlace()); + const int64_t *ids = ids_t->data(); + const T *d_output = d_output_t->data(); + T *d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index d482506bf0361c11a019e32efbf348a64aaf5164..c9f074ca0e8dafb374dc9368165df5af5053a6b8 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -36,43 +36,13 @@ template class LookupTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { + auto *ids_t = 
context.Input("Ids"); // int tensor + auto *output_t = context.Output("Out"); // float tensor auto *table_var = context.InputVar("W"); - auto *ids_var = context.InputVar("Ids"); - Tensor *output_t = context.Output("Out"); - int64_t padding_idx = context.Attr("padding_idx"); - - DDim table_dim; - if (table_var->IsType()) { - table_dim = context.Input("W")->dims(); - } else if (table_var->IsType()) { - auto *table_t = context.Input("W"); - table_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter W of a LookupTable " - "must be either LoDTensor or SelectedRows"); - } - - int64_t *ids; - int64_t ids_numel; - - // The type of Ids(Input) is SelectedRows or LoDTensor, when Ids's type - // is LoDTensor, this tensor contains the ids to be looked up in W; - // when Ids's type is SelectedRows, the rows of Ids contains the - // ids to be looked up in W. - if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->data()); - ids_numel = ids_t->numel(); - } else if (ids_var->IsType()) { - auto *ids_t = context.Input("Ids"); - ids = const_cast(ids_t->rows().data()); - ids_numel = ids_t->rows().size(); - output_t->Resize({ids_numel, table_dim[1]}); - } else { - PADDLE_THROW("Unsupported Variable Type of Ids"); - } + int64_t padding_idx = context.Attr("padding_idx"); + int64_t *ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); if (table_var->IsType()) { auto *table_t = context.Input("W"); diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc index db641a4bc2c637e0babee6b6bc6e67b068759ff5..1172822e12222ded219104e3bad2613d30f891b8 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt_engine_op.cc @@ -163,7 +163,4 @@ REGISTER_OP_CPU_KERNEL( ops::TensorRTEngineKernel, ops::TensorRTEngineKernel); -// A trick to compile with the needed TensorRT op converter. -USE_TRT_CONVERTER(mul) - #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index e0d7937ae2f3ce4bda12f3771727e2992d63cb9b..a6f68f8b0c0a9b07c326888e30c0c911e7861607 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -60,3 +60,7 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) + +IF(WITH_GPU) + nv_test(cuda_helper_test SRCS cuda_helper_test.cu) +ENDIF() diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 77ecb170111d63f23312d06fa8a8172bc45f2a4e..234a04b5c2eb5ee643e8a4e723b28331cd8e6ee0 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML +#include #include "paddle/fluid/platform/dynload/mklml.h" #endif @@ -33,6 +34,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? 
num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); + omp_set_num_threads(num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h index ecec4178f2d9937920e52eb74bf9068b84e741a0..23457ff5fe1ec27094113ba0dde26adc64c716b5 100644 --- a/paddle/fluid/platform/cuda_device_function.h +++ b/paddle/fluid/platform/cuda_device_function.h @@ -14,6 +14,10 @@ limitations under the License. */ #pragma once #include <cuda.h> +// NOTE(): support float16 to half in header file. +#define PADDLE_CUDA_FP16 +#include <cuda_fp16.h> +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { @@ -36,6 +40,18 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val, #endif } +// CUDA 9.0 and above have a native, float16-compatible shfl_down. +#if CUDA_VERSION < 9000 +template <> +__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask, + float16 val, int delta, + int width) { + half tmp = static_cast<half>(val); + tmp = __shfl_down(tmp, static_cast<unsigned>(delta), width); + return float16(tmp); +} +#endif + template <typename T> __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, int width = 32) { @@ -46,6 +62,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line, #endif } +template <typename T> +HOSTDEVICE T Infinity() { + return INFINITY; +} + template <typename T> __device__ T reduceSum(T val, int tid, int len) { // NOTE(zcd): The warp size should be taken from the diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..4a47ba5ccad4de338844e60f6fcbd6b7c11e891b --- /dev/null +++ b/paddle/fluid/platform/cuda_helper_test.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include <gtest/gtest.h> +#include <cstdlib> +#include <random> + +#define PADDLE_CUDA_FP16 +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/float16.h" + +using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using paddle::platform::float16; + +#define CUDA_ATOMIC_KERNEL(op, T) \ + __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; \ + i += blockDim.x * gridDim.x) { \ + paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]); \ + } \ + } + +template <typename T> +struct AddFunctor { + T operator()(const T& a, const T& b) { return a + b; } +}; + +template <typename T> +struct SubFunctor { + T operator()(const T& a, const T& b) { return a - b; } +}; + +// NOTE(dzhwinter): float16 addition introduces small underflow/overflow error, +// so we use EXPECT_NEAR to check the result.
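The 0.001 tolerance used with EXPECT_NEAR in the harness below follows from the float16 format: a 10-bit mantissa spaces representable values in [1, 2) by 2^-10. A small check of that arithmetic (plain C++, no GPU required):

```cpp
#include <cmath>
#include <cstdio>

int main() {
  // float16 keeps a 10-bit mantissa, so neighboring representable values
  // in [1, 2) differ by 2^-10 ~= 0.00098 -- the EXPECT_NEAR tolerance of
  // 0.001 is roughly one ulp at that scale.
  const double fp16_ulp = std::ldexp(1.0, -10);
  std::printf("fp16 spacing in [1,2): %g\n", fp16_ulp);  // 0.000976562
  return 0;
}
```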
+#define ARITHMETIC_KERNEL_LAUNCH(op, T) \ + void Test##T##op(size_t num) { \ + T *in1, *in2, *out; \ + T *d_in1, *d_in2; \ + size_t size = sizeof(T) * num; \ + cudaMalloc(reinterpret_cast<void **>(&d_in1), size); \ + cudaMalloc(reinterpret_cast<void **>(&d_in2), size); \ + in1 = reinterpret_cast<T *>(malloc(size)); \ + in2 = reinterpret_cast<T *>(malloc(size)); \ + out = reinterpret_cast<T *>(malloc(size)); \ + std::minstd_rand engine; \ + std::uniform_real_distribution<double> dist(0.0, 1.0); \ + for (size_t i = 0; i < num; ++i) { \ + in1[i] = static_cast<T>(dist(engine)); \ + in2[i] = static_cast<T>(dist(engine)); \ + } \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num); \ + cudaDeviceSynchronize(); \ + cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost); \ + cudaDeviceSynchronize(); \ + for (size_t i = 0; i < num; ++i) { \ + EXPECT_NEAR(static_cast<float>(out[i]), \ + static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \ + 0.001); \ + } \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } +CUDA_ATOMIC_KERNEL(Add, float); +CUDA_ATOMIC_KERNEL(Add, double); +CUDA_ATOMIC_KERNEL(Add, float16); + +ARITHMETIC_KERNEL_LAUNCH(Add, float); +ARITHMETIC_KERNEL_LAUNCH(Add, double); +ARITHMETIC_KERNEL_LAUNCH(Add, float16); + +namespace paddle { +namespace platform { +USE_CUDA_ATOMIC(Sub, int); +}; +}; +CUDA_ATOMIC_KERNEL(Sub, int); +ARITHMETIC_KERNEL_LAUNCH(Sub, int); + +// cuda primitives +TEST(CudaAtomic, Add) { + TestfloatAdd(static_cast<size_t>(10)); + TestfloatAdd(static_cast<size_t>(1024 * 1024)); + TestdoubleAdd(static_cast<size_t>(10)); + TestdoubleAdd(static_cast<size_t>(1024 * 1024)); +} + +TEST(CudaAtomic, Sub) { + TestintSub(static_cast<size_t>(10)); + TestintSub(static_cast<size_t>(1024 * 1024)); +} + +TEST(CudaAtomic, float16) { + using paddle::platform::float16; + Testfloat16Add(static_cast<size_t>(1)); + Testfloat16Add(static_cast<size_t>(2)); + Testfloat16Add(static_cast<size_t>(3)); + + Testfloat16Add(static_cast<size_t>(10)); + Testfloat16Add(static_cast<size_t>(1024 * 1024)); +} diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h index d535ed2f89df6a0b311ec068ecd92c8e3183cee7..94ce83975a7f13daa2b6a4d480cb22cc95811b9b 100644 --- a/paddle/fluid/platform/cuda_primitives.h +++ b/paddle/fluid/platform/cuda_primitives.h @@ -14,12 +14,14 @@ limitations under the License.
*/ #pragma once #include <cuda.h> +#include +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace platform { #define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T* address, const T val) + __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) #define USE_CUDA_ATOMIC(op, T) \ CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } @@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) { static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT "long long should be int64"); return CudaAtomicAdd( - reinterpret_cast<unsigned long long int*>(address), // NOLINT - static_cast<unsigned long long int>(val)); // NOLINT + reinterpret_cast<unsigned long long int *>(address), // NOLINT + static_cast<unsigned long long int>(val)); // NOLINT } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 USE_CUDA_ATOMIC(Add, double); #else CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int* address_as_ull = // NOLINT - reinterpret_cast<unsigned long long int*>(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT + unsigned long long int *address_as_ull = // NOLINT + reinterpret_cast<unsigned long long int *>(address); // NOLINT + unsigned long long int old = *address_as_ull, assumed; // NOLINT do { assumed = old; @@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) { return __longlong_as_double(old); } +#endif + +#ifdef PADDLE_CUDA_FP16 +// NOTE(dzhwinter): CUDA does not provide atomicCAS for half. +// Instead, treat the half address as part of an aligned 32-bit word and +// run atomicCAS on that word, updating either the high or the low 16 bits +// depending on where the value is stored. +// Since most warp threads will fail the atomicCAS under contention, this +// implementation should be avoided in highly concurrent code; it is +// slower than converting the value to 32 bits and doing a full atomicCAS. + +// convert the value into float and do the add arithmetic, +// then store the result into a uint32. +inline __device__ uint32_t add_to_low_half(uint32_t val, float x) { + float16 low_half; + // the float16 in the lower 16 bits + low_half.x = static_cast<uint16_t>(val & 0xffffu); + low_half = static_cast<float16>(static_cast<float>(low_half) + x); + return (val & 0xffff0000u) | low_half.x; +} + +inline __device__ uint32_t add_to_high_half(uint32_t val, float x) { + float16 high_half; + // the float16 in the higher 16 bits + high_half.x = static_cast<uint16_t>(val >> 16); + high_half = static_cast<float16>(static_cast<float>(high_half) + x); + return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16); +} + +CUDA_ATOMIC_WRAPPER(Add, float16) { + // the packed float16 value may exist in either the lower or the higher + // 16 bits of the aligned 32-bit word. + uint32_t *address_as_ui = + reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) - + (reinterpret_cast<uintptr_t>(address) & 2)); + float val_f = static_cast<float>(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((size_t)address & 2) == 0) { + // the float16 value stays in the lower 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old & 0xffffu; + return ret; + } else { + // the float16 value stays in the higher 16 bits of the address.
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); + } while (old != assumed); + float16 ret; + ret.x = old >> 16; + return ret; + } +} + #endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h index ffd183af68514dbb1a8b3de39000c9ca3f56ddc3..efb021c838e3680ab2cdd1c4b298cf7ec2186478 100644 --- a/paddle/fluid/platform/float16.h +++ b/paddle/fluid/platform/float16.h @@ -67,8 +67,11 @@ struct float16; } // namespace platform } // namespace paddle +// NOTE(): +// Do not move the eigen.h header below these includes, otherwise the +// EigenVector specialization will fail to compile. #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/hostdevice.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace platform { @@ -898,6 +901,30 @@ struct is_pod { is_standard_layout::value; }; +template <> +struct is_floating_point<paddle::platform::float16> + : std::integral_constant< + bool, std::is_same<paddle::platform::float16, + typename std::decay<paddle::platform::float16>::type>::value> {}; +template <> +struct is_signed<paddle::platform::float16> { + static const bool value = true; +}; + +template <> +struct is_unsigned<paddle::platform::float16> { + static const bool value = false; +}; + +inline bool isnan(const paddle::platform::float16& a) { + return paddle::platform::isnan(a); +} + +inline bool isinf(const paddle::platform::float16& a) { + return paddle::platform::isinf(a); +} + template <> struct numeric_limits<paddle::platform::float16> { static const bool is_specialized = true; diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc index ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc..27e930e6e0a76982b3f27619f38a4a08d82cafa1 100644 --- a/paddle/fluid/platform/float16_test.cc +++ b/paddle/fluid/platform/float16_test.cc @@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) { } } +TEST(float16, floating) { + // compile-time property, checked here via PADDLE_ASSERT. + PADDLE_ASSERT(std::is_floating_point<float16>::value); +} + TEST(float16, print) { float16 a = float16(1.0f); std::cout << a << std::endl; } +// CPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + float16 c = static_cast<float16>(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = static_cast<float16>(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu index 1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc..e2b7ca9b03809113c31af8ff4d3ad3713748f330 100644 --- a/paddle/fluid/platform/float16_test.cu +++ b/paddle/fluid/platform/float16_test.cu @@ -11,11 +11,13 @@ limitations under the License.
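The masked compare-and-swap loop above can be exercised on the host with std::atomic. A simplified analogue, assuming a plain uint16_t payload instead of float16 so that only the packing and CAS-retry structure is shown:

```cpp
#include <atomic>
#include <cstdint>
#include <cstdio>

// Add `inc` into one 16-bit half of an aligned 32-bit word, retrying the
// compare-exchange until no other thread raced us -- the same loop shape as
// the float16 CudaAtomicAdd above, minus the fp16/float conversion.
void AddToHalf(std::atomic<uint32_t> *word, bool high, uint16_t inc) {
  uint32_t assumed = word->load();
  uint32_t desired;
  do {
    uint16_t half = high ? static_cast<uint16_t>(assumed >> 16)
                         : static_cast<uint16_t>(assumed & 0xffffu);
    half = static_cast<uint16_t>(half + inc);
    desired = high ? ((assumed & 0x0000ffffu) |
                      (static_cast<uint32_t>(half) << 16))
                   : ((assumed & 0xffff0000u) | half);
    // on failure, compare_exchange_weak reloads `assumed` with the
    // current value, exactly like re-reading `old` after atomicCAS
  } while (!word->compare_exchange_weak(assumed, desired));
}

int main() {
  std::atomic<uint32_t> word{0};
  AddToHalf(&word, false, 3);  // low half
  AddToHalf(&word, true, 2);   // high half
  std::printf("0x%08x\n", word.load());  // 0x00020003
  return 0;
}
```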
*/ #include "paddle/fluid/platform/float16.h" +#include #include +#include +#include <typeindex> #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/legacy/utils/Logging.h" #define ARITHMETIC_KERNEL(op_type, sign) \ __global__ void op_type(const half* in1, const half* in2, half* out) { \ @@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) { } } +template <typename T> +struct Functor { + bool operator()(const T& val) { + return std::type_index(typeid(T)) == + std::type_index(typeid(platform::float16)); + } +}; + +TEST(float16, typeid) { + // the framework relies heavily on the typeid hash + Functor<float16> functor; + float16 a = float16(.0f); + Functor<int> functor2; + int b(0); + + // compile-time properties, checked here via PADDLE_ASSERT + PADDLE_ASSERT(functor(a) == true); + PADDLE_ASSERT(functor2(b) == false); +} + +// GPU test +TEST(float16, isinf) { + float16 a; + a.x = 0x7c00; + float16 b = float16(INFINITY); + // underflows to 0 + float16 native_a(5e-40f); + // overflows to inf + float16 native_b(5e40f); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(native_b), true); + EXPECT_EQ(native_a, float16(0)); +} + +TEST(float16, isnan) { + float16 a; + a.x = 0x7fff; + float16 b = float16(NAN); + float16 c = float16(5e40); + // inf * +-0 yields a nan + float16 d = c * float16(0); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(d), true); +} + +TEST(float16, cast) { + float16 a; + a.x = 0x0070; + auto b = a; + { + // reinterpret the bits while keeping the same value + float16 c = reinterpret_cast<float16&>(reinterpret_cast<uint16_t&>(b)); + EXPECT_EQ(b, c); + } + + { + // use the low 16 bits of a uint32 to store the float16 + uint32_t c = static_cast<uint32_t>(b.x); + float16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + } // namespace platform } // namespace paddle #endif // PADDLE_CUDA_FP16 diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 0b776528414735e8a7c1e3763e7ccb662bb9f285..6f1f0c4796f3bae2fb419bf103cb6c0c5489bf65 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -23,6 +23,9 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/piece.h" +DEFINE_int32(paddle_num_threads, 1, + "Number of threads for each paddle instance."); + namespace paddle { namespace framework { @@ -115,7 +118,7 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); #ifndef PADDLE_WITH_MKLDNN - platform::SetNumThreads(1); + platform::SetNumThreads(FLAGS_paddle_num_threads); #endif } diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..6e92c60ea2db00cc6e227830228888f9a06735c4 --- /dev/null +++ b/patches/grpc/completion_queue.h @@ -0,0 +1,386 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
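The constants 0x7c00 and 0x7fff in the isinf/isnan tests above come straight from the fp16 layout: 1 sign, 5 exponent, and 10 mantissa bits, where an all-ones exponent encodes inf (zero mantissa) or NaN (nonzero mantissa). A host-side sketch of those predicates:

```cpp
#include <cstdint>
#include <cstdio>

// Mask off the sign bit; 0x7c00 is the all-ones exponent with zero
// mantissa (+/-inf), while any nonzero mantissa under that exponent
// (e.g. 0x7fff from the test) is a NaN.
bool IsInfFp16(uint16_t bits) { return (bits & 0x7fffu) == 0x7c00u; }
bool IsNanFp16(uint16_t bits) {
  return (bits & 0x7c00u) == 0x7c00u && (bits & 0x03ffu) != 0;
}

int main() {
  std::printf("%d %d\n", IsInfFp16(0x7c00), IsNanFp16(0x7fff));  // 1 1
  return 0;
}
```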
+ +/// A completion queue implements a concurrent producer-consumer queue, with +/// two main API-exposed methods: \a Next and \a AsyncNext. These +/// methods are the essential component of the gRPC C++ asynchronous API. +/// There is also a \a Shutdown method to indicate that a given completion queue +/// will no longer have regular events. This must be called before the +/// completion queue is destroyed. +/// All completion queue APIs are thread-safe and may be used concurrently with +/// any other completion queue API invocation; it is acceptable to have +/// multiple threads calling \a Next or \a AsyncNext on the same or different +/// completion queues, or to call these methods concurrently with a \a Shutdown +/// elsewhere. +/// \remark{All other API calls on completion queue should be completed before +/// a completion queue destructor is called.} +#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H +#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H + +#include <typeinfo> + +#include <grpc/impl/codegen/atm.h> +#include <grpcpp/impl/codegen/completion_queue_tag.h> +#include <grpcpp/impl/codegen/core_codegen_interface.h> +#include <grpcpp/impl/codegen/grpc_library.h> +#include <grpcpp/impl/codegen/status.h> +#include <grpcpp/impl/codegen/time.h> + +struct grpc_completion_queue; + +namespace grpc { + +template <class R> +class ClientReader; +template <class W> +class ClientWriter; +template <class W, class R> +class ClientReaderWriter; +template <class R> +class ServerReader; +template <class W> +class ServerWriter; +namespace internal { +template <class W, class R> +class ServerReaderWriterBody; +} // namespace internal + +class Channel; +class ChannelInterface; +class ClientContext; +class CompletionQueue; +class Server; +class ServerBuilder; +class ServerContext; +class ServerInterface; + +namespace internal { +class CompletionQueueTag; +class RpcMethod; +template <class ServiceType, class RequestType, class ResponseType> +class RpcMethodHandler; +template <class ServiceType, class RequestType, class ResponseType> +class ClientStreamingHandler; +template <class ServiceType, class RequestType, class ResponseType> +class ServerStreamingHandler; +template <class ServiceType, class RequestType, class ResponseType> +class BidiStreamingHandler; +class UnknownMethodHandler; +template <class Streamer, bool WriteNeeded> +class TemplatedBidiStreamingHandler; +template <class InputMessage, class OutputMessage> +class BlockingUnaryCallImpl; +} // namespace internal + +extern CoreCodegenInterface* g_core_codegen_interface; + +/// A thin wrapper around \ref grpc_completion_queue (see \ref +/// src/core/lib/surface/completion_queue.h). +/// See \ref doc/cpp/perf_notes.md for notes on best practices for high +/// performance servers. +class CompletionQueue : private GrpcLibraryCodegen { + public: + /// Default constructor. Implicitly creates a \a grpc_completion_queue + /// instance. + CompletionQueue() + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {} + + /// Wrap \a take, taking ownership of the instance. + /// + /// \param take The completion queue instance to wrap. Ownership is taken. + explicit CompletionQueue(grpc_completion_queue* take); + + /// Destructor. Destroys the owned wrapped completion queue / instance. + ~CompletionQueue() { + if (typeid(*g_core_codegen_interface).hash_code() != + typeid(CoreCodegenInterface).hash_code()) { + g_core_codegen_interface->grpc_completion_queue_destroy(cq_); + } + } + + /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. + enum NextStatus { + SHUTDOWN, ///< The completion queue has been shutdown and fully-drained + GOT_EVENT, ///< Got a new event; \a tag will be filled in with its + ///< associated value; \a ok indicating its success. + TIMEOUT ///< deadline was reached. + }; + + /// Read from the queue, blocking until an event is available or the queue is + /// shutting down. + /// + /// \param tag[out] Updated to point to the read event's tag. + /// \param ok[out] true if read a successful event, false otherwise.
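The typeid guard in the destructor above is the core of this patch: the queue is destroyed only while the global codegen hook still points at a real derived implementation, avoiding calls into gRPC state that static teardown has already reclaimed. A stripped-down sketch of the same pattern, with illustrative names:

```cpp
#include <typeinfo>

// Global hook, normally set to a concrete implementation at startup and
// possibly left pointing at the bare base during static teardown.
struct CodegenInterface {
  virtual ~CodegenInterface() = default;
  virtual void DestroyQueue() {}
};
struct RealCodegen : CodegenInterface {
  void DestroyQueue() override { /* actual cleanup would go here */ }
};

CodegenInterface *g_iface = nullptr;

// Forward the destroy call only while g_iface still names a derived
// implementation; if only the base type remains, skip cleanup entirely
// (the "too early destroy" situation this patch works around).
void SafeDestroy() {
  if (g_iface &&
      typeid(*g_iface).hash_code() != typeid(CodegenInterface).hash_code()) {
    g_iface->DestroyQueue();
  }
}
```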
+ /// + /// Note that each tag sent to the completion queue (through RPC operations + /// or alarms) will be delivered out of the completion queue by a call to + /// Next (or a related method), regardless of whether the operation succeeded + /// or not. Success here means that this operation completed in the normal + /// valid manner. + /// + /// Server-side RPC request: \a ok indicates that the RPC has indeed + /// been started. If it is false, the server has been Shutdown + /// before this particular call got matched to an incoming RPC. + /// + /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is + /// going to go to the wire. If it is false, it is not going to the wire. This + /// would happen if the channel is either permanently broken or + /// transiently broken but with the fail-fast option. (Note that async unary + /// RPCs don't post a CQ tag at this point, nor do client-streaming + /// or bidi-streaming RPCs that have the initial metadata corked option set.) + /// + /// Client-side Write, Client-side WritesDone, Server-side Write, + /// Server-side Finish, Server-side SendInitialMetadata (which is + /// typically included in Write or Finish when not done explicitly): + /// \a ok means that the data/metadata/status/etc is going to go to the + /// wire. If it is false, it is not going to the wire because the call + /// is already dead (i.e., canceled, deadline expired, other side + /// dropped the channel, etc). + /// + /// Client-side Read, Server-side Read, Client-side + /// RecvInitialMetadata (which is typically included in Read if not + /// done explicitly): \a ok indicates whether there is a valid message + /// that got read. If not, you know that there are certainly no more + /// messages that can ever be read from this stream. For the client-side + /// operations, this only happens because the call is dead. For the + /// server-side operation, though, this could happen because the client + /// has done a WritesDone already. + /// + /// Client-side Finish: \a ok should always be true + /// + /// Server-side AsyncNotifyWhenDone: \a ok should always be true + /// + /// Alarm: \a ok is true if it expired, false if it was canceled + /// + /// \return true if got an event, false if the queue is fully drained and + /// shut down. + bool Next(void** tag, bool* ok) { + return (AsyncNextInternal(tag, + ok, + g_core_codegen_interface->gpr_inf_future( + GPR_CLOCK_REALTIME)) != SHUTDOWN); + } + + /// Read from the queue, blocking up to \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline). A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param tag[out] Upon success, updated to point to the event's tag. + /// \param ok[out] Upon success, true if a successful event, false otherwise. + /// See documentation for CompletionQueue::Next for explanation of ok. + /// \param deadline[in] How long to block waiting for an event. + /// + /// \return The type of event read. + template <typename T> + NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) { + TimePoint<T> deadline_tp(deadline); + return AsyncNextInternal(tag, ok, deadline_tp.raw_time()); + } + + /// EXPERIMENTAL + /// First executes \a F, then reads from the queue, blocking up to + /// \a deadline (or the queue's shutdown). + /// Both \a tag and \a ok are updated upon success (if an event is available + /// within the \a deadline).
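For reference, the canonical consumer of Next is a drain loop; this is ordinary public gRPC API usage, not Paddle-specific, and assumes tags were enqueued elsewhere by async RPC operations:

```cpp
#include <grpcpp/grpcpp.h>

// Block on the queue until Shutdown() has been called and every pending
// tag has been delivered; `ok` carries the per-operation outcome described
// in the comments above.
void DrainCompletionQueue(grpc::CompletionQueue *cq) {
  void *tag = nullptr;
  bool ok = false;
  while (cq->Next(&tag, &ok)) {
    if (!ok) continue;  // failed/cancelled operation; tag is still delivered
    // ... dispatch on `tag` ...
  }
}
```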
A \a tag points to an arbitrary location usually + /// employed to uniquely identify an event. + /// + /// \param F[in] Function to execute before calling AsyncNext on this queue. + /// \param tag[out] Upon success, updated to point to the event's tag. + /// \param ok[out] Upon success, true if read a regular event, false otherwise. + /// \param deadline[in] How long to block waiting for an event. + /// + /// \return The type of event read. + template <typename T, typename F> + NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) { + CompletionQueueTLSCache cache = CompletionQueueTLSCache(this); + f(); + if (cache.Flush(tag, ok)) { + return GOT_EVENT; + } else { + return AsyncNext(tag, ok, deadline); + } + } + + /// Request the shutdown of the queue. + /// + /// \warning This method must be called at some point if this completion queue + /// is accessed with Next or AsyncNext. \a Next will not return false + /// until this method has been called and all pending tags have been drained. + /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .) + /// Only once either one of these methods does that (that is, once the queue + /// has been \em drained) can an instance of this class be destroyed. + /// Also note that applications must ensure that no work is enqueued on this + /// completion queue after this method is called. + void Shutdown(); + + /// Returns a \em raw pointer to the underlying \a grpc_completion_queue + /// instance. + /// + /// \warning Remember that the returned instance is owned. No transfer of + /// ownership is performed. + grpc_completion_queue* cq() { return cq_; } + + protected: + /// Private constructor of CompletionQueue only visible to friend classes + CompletionQueue(const grpc_completion_queue_attributes& attributes) { + cq_ = g_core_codegen_interface->grpc_completion_queue_create( + g_core_codegen_interface->grpc_completion_queue_factory_lookup( + &attributes), + &attributes, + NULL); + InitialAvalanching(); // reserve this for the future shutdown + } + + private: + // Friend synchronous wrappers so that they can access Pluck(), which is + // a semi-private API geared towards the synchronous implementation. + template <class R> + friend class ::grpc::ClientReader; + template <class W> + friend class ::grpc::ClientWriter; + template <class W, class R> + friend class ::grpc::ClientReaderWriter; + template <class R> + friend class ::grpc::ServerReader; + template <class W> + friend class ::grpc::ServerWriter; + template <class W, class R> + friend class ::grpc::internal::ServerReaderWriterBody; + template <class ServiceType, class RequestType, class ResponseType> + friend class ::grpc::internal::RpcMethodHandler; + template <class ServiceType, class RequestType, class ResponseType> + friend class ::grpc::internal::ClientStreamingHandler; + template <class ServiceType, class RequestType, class ResponseType> + friend class ::grpc::internal::ServerStreamingHandler; + template <class Streamer, bool WriteNeeded> + friend class ::grpc::internal::TemplatedBidiStreamingHandler; + friend class ::grpc::internal::UnknownMethodHandler; + friend class ::grpc::Server; + friend class ::grpc::ServerContext; + friend class ::grpc::ServerInterface; + template <class InputMessage, class OutputMessage> + friend class ::grpc::internal::BlockingUnaryCallImpl; + + /// EXPERIMENTAL + /// Creates a Thread Local cache to store the first event + /// on this completion queue queued from this thread. Once + /// initialized, it must be flushed on the same thread. + class CompletionQueueTLSCache { + public: + CompletionQueueTLSCache(CompletionQueue* cq); + ~CompletionQueueTLSCache(); + bool Flush(void** tag, bool* ok); + + private: + CompletionQueue* cq_; + bool flushed_; + }; + + NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline); + + /// Wraps \a grpc_completion_queue_pluck.
+ /// \warning Must not be mixed with calls to \a Next. + bool Pluck(internal::CompletionQueueTag* tag) { + auto deadline = + g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok)); + GPR_CODEGEN_ASSERT(ignored == tag); + // Ignore mutations by FinalizeResult: Pluck returns the C API status + return ev.success != 0; + } + + /// Performs a single polling pluck on \a tag. + /// \warning Must not be mixed with calls to \a Next. + /// + /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already + /// shutdown. This is most likely a bug and if it is a bug, then change this + /// implementation to simply call the other TryPluck function with a zero + /// timeout, i.e.: + /// TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME)) + void TryPluck(internal::CompletionQueueTag* tag) { + auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME); + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT) return; + bool ok = ev.success != 0; + void* ignored = tag; + // the tag must be swallowed if using TryPluck + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if + /// the pluck() was successful and returned the tag. + /// + /// This expects tag->FinalizeResult (if called) to return 'false', i.e. expects + /// that the tag is internal, not something that is returned to the user. + void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) { + auto ev = g_core_codegen_interface->grpc_completion_queue_pluck( + cq_, tag, deadline, nullptr); + if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) { + return; + } + + bool ok = ev.success != 0; + void* ignored = tag; + GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok)); + } + + /// Manage state of avalanching operations: completion queue tags that + /// trigger other completion queue operations. The underlying core completion + /// queue should not really shutdown until all avalanching operations have + /// been finalized. Note that we maintain the requirement that an avalanche + /// registration must take place before CQ shutdown (which must be maintained + /// elsewhere) + void InitialAvalanching() { + gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1)); + } + void RegisterAvalanching() { + gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_, + static_cast<gpr_atm>(1)); + } + void CompleteAvalanching(); + + grpc_completion_queue* cq_; // owned + + gpr_atm avalanches_in_flight_; +}; + +/// A specific type of completion queue used by the processing of notifications +/// by servers. Instantiated by \a ServerBuilder. +class ServerCompletionQueue : public CompletionQueue { + public: + bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; } + + private: + grpc_cq_polling_type polling_type_; + friend class ServerBuilder; + /// \param is_frequently_polled Informs the GRPC library about whether the + /// server completion queue would be actively polled (by calling Next() or + /// AsyncNext()). By default all server completion queues are assumed to be + /// frequently polled.
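The \warning on Shutdown above translates into a fixed teardown order: shut down, drain until Next returns false, and only then destroy the queue. A short sketch against the public gRPC API:

```cpp
#include <grpcpp/grpcpp.h>

// The lifetime contract these headers are defending: Shutdown() first,
// then drain every remaining tag, and only afterwards let the queue's
// destructor run.
void ShutdownAndDrain(grpc::CompletionQueue *cq) {
  cq->Shutdown();
  void *tag = nullptr;
  bool ok = false;
  while (cq->Next(&tag, &ok)) {
    // discard remaining tags
  }
  // *cq may be destroyed safely now
}
```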
+ ServerCompletionQueue(grpc_cq_polling_type polling_type) + : CompletionQueue(grpc_completion_queue_attributes{ + GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}), + polling_type_(polling_type) {} +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H diff --git a/patches/grpc/fix_too_early_destory.patch b/patches/grpc/fix_too_early_destory.patch deleted file mode 100644 index d7790d56b07551b8daae9b9a41be5432e5b8b9cc..0000000000000000000000000000000000000000 --- a/patches/grpc/fix_too_early_destory.patch +++ /dev/null @@ -1,47 +0,0 @@ -diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h -index 80c7c41982..3f7d8a7714 100644 ---- a/include/grpcpp/impl/codegen/completion_queue.h -+++ b/include/grpcpp/impl/codegen/completion_queue.h -@@ -32,6 +32,8 @@ - #ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H - #define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H - -+#include -+ - #include - #include - #include -@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen { - - /// Destructor. Destroys the owned wrapped completion queue / instance. - ~CompletionQueue() { -- g_core_codegen_interface->grpc_completion_queue_destroy(cq_); -+ if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) { -+ g_core_codegen_interface->grpc_completion_queue_destroy(cq_); -+ } - } - - /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT. -diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h -index 17c904d71a..a092b2204d 100644 ---- a/include/grpcpp/impl/codegen/grpc_library.h -+++ b/include/grpcpp/impl/codegen/grpc_library.h -@@ -19,6 +19,8 @@ - #ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H - #define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H - -+#include -+ - #include - - namespace grpc { -@@ -47,7 +49,8 @@ class GrpcLibraryCodegen { - } - } - virtual ~GrpcLibraryCodegen() { -- if (grpc_init_called_) { -+ if (grpc_init_called_ && -+ typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) { - GPR_CODEGEN_ASSERT(g_glip && - "gRPC library not initialized. See " - "grpc::internal::GrpcLibraryInitializer."); diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h new file mode 100644 index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47 --- /dev/null +++ b/patches/grpc/grpc_library.h @@ -0,0 +1,64 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H +#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H + +#include + +#include + +namespace grpc { + +class GrpcLibraryInterface { + public: + virtual ~GrpcLibraryInterface() = default; + virtual void init() = 0; + virtual void shutdown() = 0; +}; + +/// Initialized by \a grpc::GrpcLibraryInitializer from +/// +extern GrpcLibraryInterface* g_glip; + +/// Classes that require gRPC to be initialized should inherit from this class. 
+class GrpcLibraryCodegen { + public: + GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) { + if (call_grpc_init) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->init(); + grpc_init_called_ = true; + } + } + virtual ~GrpcLibraryCodegen() { + if (grpc_init_called_ && + typeid(*g_glip).hash_code() != + typeid(GrpcLibraryInterface).hash_code()) { + GPR_CODEGEN_ASSERT(g_glip && + "gRPC library not initialized. See " + "grpc::internal::GrpcLibraryInitializer."); + g_glip->shutdown(); + } + } + + private: + bool grpc_init_called_; +}; + +} // namespace grpc + +#endif // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 5ad3c5e7f7024e32609982abb64c53b481d0e655..447abfac59648efd51d5a2b0cf0aa9bcc5319f3f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -123,7 +123,7 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory' + 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads' ] if core.is_compiled_with_dist(): read_env_flags.append('rpc_deadline') diff --git a/python/paddle/fluid/regularizer.py b/python/paddle/fluid/regularizer.py index 0d0288a176adf0ccc43780f26b2762901de73612..6eaac4432d4df1288f37607a01484434542f1138 100644 --- a/python/paddle/fluid/regularizer.py +++ b/python/paddle/fluid/regularizer.py @@ -142,14 +142,20 @@ class L2DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) param = decay @@ -216,14 +222,20 @@ class L1DecayRegularizer(WeightDecayRegularizer): dtype="float32", shape=param.shape, lod_level=param.lod_level) if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + idx = block.create_var( + dtype="int64", + shape=param.shape, + type=core.VarDesc.VarType.LOD_TENSOR) decay = block.create_var( dtype="float32", shape=param.shape, type=core.VarDesc.VarType.SELECTED_ROWS) + block.append_op( + type='extract_rows', inputs={'X': grad}, outputs={'Out': idx}) block.append_op( type='lookup_table', inputs={'W': param, - 'Ids': grad}, + 'Ids': idx}, outputs={'Out': decay}, attrs={'is_sparse': True}) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 322d76515e76c3d322ac7c4f989bbc95875cb654..43f68ff4592df6757691b06db52cf5e0e2ebc6d7 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -40,7 +40,7 @@ function(py_test_modules TARGET_NAME) ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) - set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() endif() endfunction() diff --git 
a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py index 3cffdebf0693520e46998e399b3d05471061e599..7199ef1020eb4e50bdcfb1cf5027ad9ec0f8ebaa 100644 --- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py @@ -278,7 +278,7 @@ class DistSeResneXt2x2: def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True): test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model( - batch_size=20) + batch_size=2) if is_dist: t = get_transpiler(trainer_id, fluid.default_main_program(), endpoints, @@ -294,11 +294,7 @@ class DistSeResneXt2x2: strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor( - True, - loss_name=avg_cost.name, - exec_strategy=strategy, - num_trainers=trainers, - trainer_id=trainer_id) + True, loss_name=avg_cost.name, exec_strategy=strategy) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py index e3e7036f08cb88087ae45fe7d7c7565c102dab8a..3b67b3f5ccd67f86f87f292d83a6039ff46260bd 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py @@ -19,6 +19,7 @@ import math import unittest import os +import sys import signal import subprocess @@ -56,7 +57,7 @@ class TestDistSeResneXt2x2(unittest.TestCase): except os.error: retry_times -= 1 - def non_test_with_place(self): + def test_with_place(self): # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN required_envs = { "PATH": os.getenv("PATH"), @@ -70,9 +71,15 @@ class TestDistSeResneXt2x2(unittest.TestCase): local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \ (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1) local_proc = subprocess.Popen( - local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local) + local_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env_local) local_proc.wait() - local_ret = local_proc.stdout.read() + out, err = local_proc.communicate() + local_ret = out + sys.stderr.write('local_loss: %s\n' % local_ret) + sys.stderr.write('local_stderr: %s\n' % err) # Run dist train to compare with local results ps0, ps1 = self.start_pserver() @@ -92,13 +99,22 @@ class TestDistSeResneXt2x2(unittest.TestCase): FNULL = open(os.devnull, 'w') tr0_proc = subprocess.Popen( - tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0) + tr0_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env0) tr1_proc = subprocess.Popen( - tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1) + tr1_cmd.split(" "), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + env=env1) tr0_proc.wait() tr1_proc.wait() - loss_data0 = tr0_proc.stdout.read() + out, err = tr0_proc.communicate() + sys.stderr.write('dist_stderr: %s\n' % err) + loss_data0 = out + sys.stderr.write('dist_loss: %s\n' % loss_data0) lines = loss_data0.split("\n") dist_first_loss = eval(lines[0].replace(" ", ","))[0] dist_last_loss = eval(lines[1].replace(" ", ","))[0] diff --git a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..6a41c44fe655b18626bdb727745dae032babe8ad --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from op_test import OpTest + + +class TestExtractRows(OpTest): + def check_with_place(self, place): + scope = core.Scope() + + # create and initialize Variable + feature_len = 12 + rows = [0, 4, 4, 7] + np_array = np.ones((len(rows), feature_len)).astype("float32") + + in_x = scope.var('X').get_selected_rows() + in_x.set_height(len(rows)) + in_x.set_rows(rows) + in_x_tensor = in_x.get_tensor() + in_x_tensor.set(np_array, place) + + # create Out Variable + out_tensor = scope.var('Out').get_tensor() + + # create and run the extract_rows operator + extract_rows_op = Operator("extract_rows", X='X', Out='Out') + extract_rows_op.run(scope, place) + + # get result from Out + result_array = np.array(out_tensor) + result_array = [ele[0] for ele in result_array] + assert result_array == rows + + def test_extract_rows(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.check_with_place(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f8692ce2ea66ef61c63bc41e77df050398ac63fd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py @@ -0,0 +1,68 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
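Tying this back to the regularizer change: extract_rows materializes the row-id list of a SelectedRows variable so that lookup_table can consume it as dense int64 Ids. A host-side C++ sketch of what the op produces (illustrative stand-in, not the actual kernel):

```cpp
#include <cstdint>
#include <vector>

// A SelectedRows variable in miniature: (rows, value), where `value` holds
// one feature_len-wide payload row per entry in `rows`.
struct SelectedRowsLite {
  std::vector<int64_t> rows;  // e.g. {0, 4, 4, 7} in the test above
  std::vector<float> value;   // len(rows) x feature_len payload
};

// extract_rows copies the row-id list out as a dense int64 tensor; the
// payload is untouched. lookup_table then gathers parameter rows by id.
std::vector<int64_t> ExtractRows(const SelectedRowsLite &grad) {
  return grad.rows;
}
```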
+ +import unittest +import numpy as np + +from op_test import OpTest + + +class TestFlattenOp(OpTest): + def setUp(self): + self.op_type = "flatten" + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + self.init_attrs() + self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 5) + self.axis = 1 + self.new_shape = (3, 20) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlattenOpAxisZero(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 3) + self.axis = 0 + self.new_shape = (1, 36) + + +class TestFlattenOpWithDefaultAxis(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 3) + self.new_shape = (3, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index 487a7b74357ec2730f8300d4b0635280406fe9bb..fcf3a0d920d8a3b903b64cecccd1915566843e2b 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -49,53 +49,6 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): pass -class TestLookupTableIdsIsSelectedRows(OpTest): - def check_with_place(self, place): - scope = core.Scope() - - # create and initialize Variable - height = 10 - rows = [0, 4, 4, 7] - row_numel = 12 - - # create and initialize W Variable - W = scope.var('W').get_tensor() - W_array = np.full((height, row_numel), 1.0).astype("float32") - for i in range(height): - W_array[i] *= i - W.set(W_array, place) - - # create and initialize Ids Variable - ids_selected_rows = scope.var('Ids').get_selected_rows() - ids_selected_rows.set_height(len(rows)) - ids_selected_rows.set_rows(rows) - np_array = np.ones((len(rows), row_numel)).astype("float32") - ids_tensor = ids_selected_rows.get_tensor() - ids_tensor.set(np_array, place) - - # create Out Variable - Out = scope.var('Out').get_selected_rows() - - # create and run lookup_table operator - concat_rows_op = Operator("lookup_table", W='W', Ids='Ids', Out='Out') - concat_rows_op.run(scope, place) - - # get result from Out - Out_tensor = Out.get_tensor() - result_array = np.array(Out_tensor) - - # all(): return True if all elements of the iterable are true (or if the iterable is empty) - for idx, row in enumerate(rows): - assert (row == result_array[idx]).all() - - def test_concat_rows(self): - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - for place in places: - self.check_with_place(place) - - class TestLookupTableWIsSelectedRows(OpTest): def check_with_place(self, place): scope = core.Scope() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 75c403163d60d1c798fb04e301d068eb97089738..a28cb79e622fcf408ed2058e723497681cb5ef5f 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -346,6 +346,7 @@ class DistributeTranspiler(object): # step1 pserver_program = Program() +
pserver_program.random_seed = self.origin_program.random_seed # step2: Create vars to receive vars at parameter servers. recv_inputs = [] for v in self.param_grad_ep_mapping[endpoint]["params"]: @@ -543,6 +544,7 @@ class DistributeTranspiler(object): """ s_prog = Program() orig_s_prog = default_startup_program() + s_prog.random_seed = orig_s_prog.random_seed params = self.param_grad_ep_mapping[endpoint]["params"] def _get_splited_name_and_shape(varname): diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -4,7 +4,7 @@ TOTAL_ERRORS=0 # The trick to remove deleted files: https://stackoverflow.com/a/2413151 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do - if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then + if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then continue; else cpplint --filter=-readability/fn_size $file;
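For the flatten tests above, every expected shape follows one rule: flatten(X, axis) reshapes to [prod(dims[:axis]), prod(dims[axis:])], with axis defaulting to 1 and axis == 0 yielding a leading 1. A small sketch of that computation:

```cpp
#include <cstdint>
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Collapse an N-D shape to 2-D around `axis`:
// [prod(dims[:axis]), prod(dims[axis:])].
std::vector<int64_t> FlattenShape(const std::vector<int64_t> &dims, int axis) {
  auto prod = [&](int lo, int hi) {
    return std::accumulate(dims.begin() + lo, dims.begin() + hi, int64_t{1},
                           std::multiplies<int64_t>());
  };
  return {prod(0, axis), prod(axis, static_cast<int>(dims.size()))};
}

int main() {
  auto s = FlattenShape({3, 2, 2, 5}, 1);  // -> (3, 20), as in the test
  std::printf("(%lld, %lld)\n", static_cast<long long>(s[0]),
              static_cast<long long>(s[1]));
  return 0;
}
```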