diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index 7ab17de3896792194ac6d6ca6800ee1741d01ae1..9e279876cfeef20a1921f8bd1c27046a477b9f56 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -42,7 +42,7 @@ class ParallelExecutor {
                    const std::vector<Scope*>& local_scopes,
                    bool allow_op_delay, bool use_default_grad_scale,
                    bool balance_parameter_opt_between_cards,
-                   size_t num_trainers = 0, size_t trainer_id = 0);
+                   size_t num_trainers = 1, size_t trainer_id = 0);

   ~ParallelExecutor();

diff --git a/paddle/fluid/operators/gen_nccl_id_op.cc b/paddle/fluid/operators/gen_nccl_id_op.cc
index b4ff6b75988b4d61d3e4c17e6871b8e01ec87d1c..36fc862213fe997aaaaff8ee3ec4ca301a0e9a48 100644
--- a/paddle/fluid/operators/gen_nccl_id_op.cc
+++ b/paddle/fluid/operators/gen_nccl_id_op.cc
@@ -75,29 +75,29 @@ class GenNCCLIdOp : public framework::OperatorBase {
     // NOTE: Can not use unique_ptr here because the default
     // deleter will call GRPC Server's base class's dtor and
     // that will cause a wired crash.
-    rpc_service_ = new detail::AsyncGRPCServer(endpoint, true);
+
+    detail::AsyncGRPCServer rpc_service(endpoint, true);
     framework::ProgramDesc empty_program;
     framework::Executor executor(dev_ctx.GetPlace());
-    rpc_service_->SetScope(scope);
-    rpc_service_->SetDevCtx(&dev_ctx);
-    rpc_service_->SetProgram(&empty_program);
-    rpc_service_->SetExecutor(&executor);
+    rpc_service.SetScope(scope);
+    rpc_service.SetDevCtx(&dev_ctx);
+    rpc_service.SetProgram(&empty_program);
+    rpc_service.SetExecutor(&executor);

     std::thread server_thread(
-        std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, rpc_service_));
-    rpc_service_->SetCond(0);
+        std::bind(&detail::AsyncGRPCServer::RunSyncUpdate, &rpc_service));
+    rpc_service.SetCond(0);
     VLOG(3) << "start getting nccl id from trainer 0...";
-    auto recv = rpc_service_->Get();
+    auto recv = rpc_service.Get();
     VLOG(3) << "got nccl id and stop server...";
-    rpc_service_->ShutDown();
+    rpc_service.ShutDown();
     VLOG(3) << "rpc server stopped";
     // TODO(wuyi): reinit nccl communicators
     server_thread.join();
-    delete rpc_service_;
   }

- protected:
-  mutable detail::AsyncGRPCServer* rpc_service_ = nullptr;
+  // protected:
+  //   mutable detail::AsyncGRPCServer* rpc_service_ = nullptr;
 };

 class GenNCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {

diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 408721be8b118f11a77287cbb3aca2af8bd81cec..e30c1a9ebf08365a9856fb32b1ce5790869e2b33 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -78,7 +78,7 @@ struct NCCLContextMap {

   explicit NCCLContextMap(const std::vector<platform::Place> &places,
                           ncclUniqueId *nccl_id = nullptr,
-                          size_t num_trainers = 0, size_t trainer_id = 0) {
+                          size_t num_trainers = 1, size_t trainer_id = 0) {
     PADDLE_ENFORCE(!places.empty());
     order_.reserve(places.size());
     for (auto &p : places) {
@@ -100,7 +100,7 @@ struct NCCLContextMap {
       PADDLE_ENFORCE(platform::dynload::ncclCommInitAll(
           comms.get(), static_cast<int>(order_.size()), order_.data()));
     } else {
-      PADDLE_ENFORCE_GT(num_trainers, 0);
+      PADDLE_ENFORCE_GT(num_trainers, 1);
       // TODO(wuyi): need to ensure each node have same number of GPUs
       {
         int nranks = num_trainers * order_.size();

diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 59294d972116a624bed60b0c766df1e00f02acdc..7358c4b60e87893b9c04e3da2221dfb69d3ba0c7 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -32,7 +32,7 @@ class ParallelExecutor(object):
                  share_vars_from=None,
                  use_default_grad_scale=True,
                  balance_parameter_opt_between_cards=False,
-                 num_trainers=0,
+                 num_trainers=1,
                  trainer_id=0):
         """
         ParallelExecutor can run program in parallel.
@@ -57,7 +57,7 @@ class ParallelExecutor(object):
             balance_parameter_opt_between_cards(bool, default True): Whether
                 updating different gradients on different cards. Currently, it
                 is not recommended.
-            num_trainers(int, default 0): If greater than 0, NCCL will be
+            num_trainers(int, default 1): If greater than 1, NCCL will be
                 initialized with multpile rank of nodes, each node should have
                 same number of GPUs. Distributed training will be enabled then.
             trainer_id(int, default 0): Must use together with num_trainers.
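
Usage note (not part of the patch): after this change, num_trainers=1 is the single-node default, and values greater than 1 switch ParallelExecutor to multi-node NCCL initialization, with trainer_id giving this node's rank. The sketch below is a hypothetical launcher under stated assumptions: only num_trainers and trainer_id come from the diff; use_cuda, loss_name, and the PADDLE_TRAINERS / PADDLE_TRAINER_ID environment-variable names are illustrative assumptions, not shown in this patch.

# Hypothetical launcher sketch; every node would run the same script with the
# same num_trainers but a distinct trainer_id in [0, num_trainers).
import os

import paddle.fluid as fluid


def build_parallel_exe(loss_name):
    num_trainers = int(os.getenv("PADDLE_TRAINERS", "1"))  # 1 == local training
    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))  # this node's rank
    return fluid.ParallelExecutor(
        use_cuda=True,              # assumed leading argument, not in this diff
        loss_name=loss_name,        # assumed argument, not in this diff
        num_trainers=num_trainers,  # > 1 enables multi-node NCCL init
        trainer_id=trainer_id)      # must be set together with num_trainers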
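
For context on the nccl_helper.h hunk: when an nccl_id is supplied, the communicator world size becomes nranks = num_trainers * order_.size(), i.e. one NCCL rank per GPU across all nodes. The snippet below only illustrates that arithmetic; the per-GPU rank formula (trainer-major layout) is an assumption for the example and does not appear in this diff.

# Illustration of the world size used by NCCLContextMap; the rank layout
# (trainer_id * gpus_per_node + local_gpu) is an assumed convention here.
def nccl_world(num_trainers, gpus_per_node):
    nranks = num_trainers * gpus_per_node
    ranks = {
        (trainer_id, local_gpu): trainer_id * gpus_per_node + local_gpu
        for trainer_id in range(num_trainers)
        for local_gpu in range(gpus_per_node)
    }
    return nranks, ranks

# Example: 2 trainers with 4 GPUs each -> nranks == 8; (trainer 1, GPU 2) -> rank 6.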