diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 7d005c9690b9486ff8c693d9c14f83853a016ced..f447a00f37c808bafe99b54af4984af9c2af1cfe 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -26,6 +26,13 @@ void TransDataDevice(const Tensor &in, const platform::Place &dst_place, platform::errors::Unavailable("Currently, model parallelism is only " "supported between CPU and CUDA.")); + // NOTE(zhiqiu): Special case for CPU->NPU, avoid stream sync. + if (platform::is_cpu_place(in.place()) && platform::is_npu_place(dst_place)) { + TensorCopy(in, dst_place, + *platform::DeviceContextPool::Instance().Get(dst_place), out); + return; + } + // NOTE(yy): TransDataDevice should wait for computation of input. if (!platform::is_cuda_pinned_place(in.place())) { platform::DeviceContextPool::Instance().Get(in.place())->Wait();