From 2c945737c318a58f0c29c897d0f8739a03d90228 Mon Sep 17 00:00:00 2001
From: ShenLiang <1422485404@qq.com>
Date: Fri, 16 Jul 2021 20:18:16 +0800
Subject: [PATCH] Add wait_server_ready for dygraph parallel (#34207)

* add wait_server_ready

* fix remove bug
---
 paddle/fluid/operators/lookup_table_v2_op_npu.cc | 1 +
 python/paddle/distributed/parallel.py            | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index c75ea537216..020dbad5307 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -40,6 +40,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel {
         platform::errors::InvalidArgument("npu only accept LoDTensor"));
     output_t->mutable_data(ctx.GetPlace());
 
+    // add copy ids to ensure ids_t is prepared.
     std::vector ids;
     TensorToVector(*ids_t, ctx.device_context(), &ids);
 
diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py
index efe74740842..1a4fe21afa5 100644
--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -193,6 +193,12 @@ def init_parallel_env():
     elif core.is_compiled_with_xpu():
         parallel_helper._set_parallel_ctx(
             core.BKCLParallelContext(strategy, place))
+
+    other_endpoints = strategy.trainer_endpoints[:]
+    other_endpoints.remove(strategy.current_endpoint)
+    if strategy.local_rank == 0:
+        wait_server_ready(other_endpoints)
+
     parallel_helper._init_parallel_ctx()
 
     # 5: init gloo context (step 2: gloo init)
--
GitLab
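
For context on the Python change: with this patch, the rank-0 trainer waits for the other
trainers' endpoints before parallel_helper._init_parallel_ctx() runs. A minimal sketch of
such an endpoint-readiness wait is shown below for illustration only; the function name
wait_for_endpoints, the timeout values, and the example endpoints are hypothetical, and
this is not Paddle's actual wait_server_ready implementation.

    # Illustrative sketch: poll each "ip:port" endpoint until a TCP connection succeeds.
    import socket
    import time

    def wait_for_endpoints(endpoints, retry_interval=3.0, connect_timeout=2.0):
        """Block until every endpoint in `endpoints` accepts a TCP connection."""
        pending = list(endpoints)
        while pending:
            still_pending = []
            for ep in pending:
                ip, port = ep.rsplit(":", 1)
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
                    sock.settimeout(connect_timeout)
                    try:
                        sock.connect((ip, int(port)))
                    except OSError:
                        # Endpoint not reachable yet; retry on the next pass.
                        still_pending.append(ep)
            pending = still_pending
            if pending:
                time.sleep(retry_interval)

    # Usage mirroring the patched logic: only local_rank 0 blocks on its peers.
    # wait_for_endpoints(["127.0.0.1:6171", "127.0.0.1:6172"])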