Add wait_server_ready for dygraph parallel (#34207)

* add wait_server_ready
* fix remove bug

Add wait_server_ready for dygraph parallel (#34207)
* add wait_server_ready
* fix remove bug
2c945737 · ShenLiang · GitHub · 8b59f5e0 · 2c945737 · 2c945737
Hide whitespace changes
Inline | Side-by-side

Showing 2 changed files with 7 additions and 0 deletions

paddle/fluid/operators/lookup_table_v2_op_npu.cc +1 -0

python/paddle/distributed/parallel.py +6 -0

未找到文件。
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -40,6 +40,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
        platform::errors::InvalidArgument("npu only accept LoDTensor"));
    output_t->mutable_data<T>(ctx.GetPlace());
+    // add copy ids to ensure ids_t is prepared.
    std::vector<int> ids;
    TensorToVector(*ids_t, ctx.device_context(), &ids);

--- a/python/paddle/distributed/parallel.py
+++ b/python/paddle/distributed/parallel.py
@@ -193,6 +193,12 @@ def init_parallel_env():
    elif core.is_compiled_with_xpu():
        parallel_helper._set_parallel_ctx(
            core.BKCLParallelContext(strategy, place))
+    other_endpoints = strategy.trainer_endpoints[:]
+    other_endpoints.remove(strategy.current_endpoint)
+    if strategy.local_rank == 0:
+        wait_server_ready(other_endpoints)
    parallel_helper._init_parallel_ctx()
    # 5: init gloo context (step 2: gloo init)