diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index c75ea537216f3524ffec53fd199986021a096e49..020dbad53076d20a73f8aeadd8c20ca9157982a1 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -40,6 +40,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel { platform::errors::InvalidArgument("npu only accept LoDTensor")); output_t->mutable_data(ctx.GetPlace()); + // Copy ids to a host-side vector to ensure ids_t is fully prepared before use. std::vector ids; TensorToVector(*ids_t, ctx.device_context(), &ids); diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index efe747408428a68772726c28af469b975836511e..1a4fe21afa577fa2e6a1f82528f45acdd977d6b7 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -193,6 +193,12 @@ def init_parallel_env(): elif core.is_compiled_with_xpu(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place)) + + other_endpoints = strategy.trainer_endpoints[:] + other_endpoints.remove(strategy.current_endpoint) + if strategy.local_rank == 0: + wait_server_ready(other_endpoints) + parallel_helper._init_parallel_ctx() # 5: init gloo context (step 2: gloo init)