diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 2a8f47462345188c3870ca07119fe7687a1ebe9f..c75ea537216f3524ffec53fd199986021a096e49 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -40,6 +40,9 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
         platform::errors::InvalidArgument("npu only accept LoDTensor"));
     output_t->mutable_data<T>(ctx.GetPlace());
 
+    std::vector<int64_t> ids;
+    TensorToVector(*ids_t, ctx.device_context(), &ids);
+
     NpuOpRunner runner;
     runner.SetType("GatherV2")
         .AddInput(*table_t)
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
index a74f923dea40236e7111e5a8d027be3a335d835f..878614ca1526e939fcfe9a34735384bdccf25c3a 100755
--- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -374,8 +374,8 @@ class ShardingOptimizer(MetaOptimizerBase):
                   'w') as f:
             f.writelines(str(main_block.program))
 
-        if core.is_compiled_with_cuda():
-            self._wait()
+        # GPU and NPU need to wait server ready
+        self._wait()
         return optimize_ops, params_grads
 
     def _init_comm(self):