Unverified commit 8b59f5e0, authored by Baibaifan, committed by GitHub

mode_npu_gather_v2 (#34194)

Parent fb55e00e
@@ -40,6 +40,9 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
        platform::errors::InvalidArgument("npu only accept LoDTensor"));
    output_t->mutable_data<T>(ctx.GetPlace());
    std::vector<int> ids;
    TensorToVector(*ids_t, ctx.device_context(), &ids);
    NpuOpRunner runner;
    runner.SetType("GatherV2")
        .AddInput(*table_t)
......
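For context on the pattern this hunk introduces: an NpuOpRunner built with SetType("GatherV2") normally continues by adding the lookup indices and a gather axis as inputs, binding the output tensor, and launching on the NPU stream. The sketch below is a hypothetical completion for illustration only; the axis value, the vector-input overload, and the stream retrieval are assumptions, not the lines elided from this diff.

    // Illustrative sketch only -- not the exact code truncated above.
    // Assumes a host-vector overload of AddInput and the usual
    // NPUDeviceContext stream accessor; the gather axis of 0 is assumed.
    NpuOpRunner runner;
    runner.SetType("GatherV2")
        .AddInput(*table_t)                   // params: the embedding table
        .AddInput(*ids_t)                     // indices: the lookup ids
        .AddInput(std::vector<int32_t>{0})    // axis 0: gather whole rows (assumed)
        .AddOutput(*output_t);                // result written to the kernel output
    auto stream =
        ctx.template device_context<paddle::platform::NPUDeviceContext>()
            .stream();
    runner.Run(stream);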
@@ -374,8 +374,8 @@ class ShardingOptimizer(MetaOptimizerBase):
                      'w') as f:
                f.writelines(str(main_block.program))
        if core.is_compiled_with_cuda():
            self._wait()
        # GPU and NPU need to wait server ready
        self._wait()
        return optimize_ops, params_grads

    def _init_comm(self):
......