diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 3b52587a286a3f4a8cac23ec1c3322442e290ab2..f3c4c92afaa44e9285674976712c58053203e8b3 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -108,8 +108,8 @@ struct NCCLContextMap {
       for (auto &gpu_id : order_) {
         int rank = trainer_id * order_.size() + gpu_id;
         PADDLE_ENFORCE(cudaSetDevice(gpu_id));
-        PADDLE_ENFORCE(
-            ncclCommInitRank(comms.get() + gpu_id, nranks, *nccl_id, rank));
+        PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+            comms.get() + gpu_id, nranks, *nccl_id, rank));
       }
     }
   }
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index f4128dcbe935ea5caf63b8c0b377f9bed36d6705..34899a54b619e3cd6914a391ae2f7dee77ec07ca 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -30,7 +30,9 @@ class ParallelExecutor(object):
                  num_threads=None,
                  allow_op_delay=False,
                  share_vars_from=None,
-                 use_default_grad_scale=True):
+                 use_default_grad_scale=True,
+                 num_nodes=0,
+                 trainer_id=0):
         """
         ParallelExecutor can run program in parallel.
 
@@ -129,7 +131,9 @@ class ParallelExecutor(object):
             scope,
             local_scopes,
             allow_op_delay,
-            use_default_grad_scale)
+            use_default_grad_scale,
+            num_nodes,
+            trainer_id)
         self.scope = scope
 
     def run(self, fetch_list, feed=None, feed_dict=None):
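
A minimal usage sketch of the two new keyword arguments on the Python side. The names loss, train_program, and feed_dict are assumptions for illustration, as is the hard-coded trainer_id; how each trainer process learns its id and how the NCCL unique id is exchanged between nodes is outside this diff and not shown.

    import paddle.fluid as fluid

    # Hypothetical two-node run; each trainer process would pass its own id
    # (typically taken from the cluster launcher / environment, not hard-coded).
    # num_nodes and trainer_id default to 0, preserving single-node behavior.
    exe = fluid.ParallelExecutor(
        use_cuda=True,
        loss_name=loss.name,          # assumed to exist in train_program
        main_program=train_program,
        num_nodes=2,                  # new argument added by this diff
        trainer_id=0)                 # new argument added by this diff

    loss_value, = exe.run(fetch_list=[loss.name], feed=feed_dict)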