diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index d2616da7a127da8c5e7b204c5216d31ad8933d97..15021b6267b65604e73abefbd7d8f683942218e7 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/profiler.h"
 #ifdef PADDLE_WITH_MKLDNN
-#include "dnnl_debug.h"
+#include "dnnl_debug.h"  // NOLINT
 #endif
 
 namespace paddle {
@@ -112,11 +112,32 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   }
   else if (platform::is_cpu_place(src_place) &&  // NOLINT
            platform::is_npu_place(dst_place)) {
-    auto stream =
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream();
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size,
-                 stream);
+    // 1. cpu tensor -> npu pinned tensor
+    platform::NPUPinnedPlace npu_pinned_place;
+    Tensor npu_pinned_tensor;
+    npu_pinned_tensor.Resize(src.dims());
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data(npu_pinned_place, src.type());
+    memory::Copy(npu_pinned_place, npu_pinned_ptr,
+                 BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
   else if (platform::is_npu_place(src_place) &&  // NOLINT
            platform::is_npu_place(dst_place)) {
diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index 020dbad53076d20a73f8aeadd8c20ca9157982a1..2a8f47462345188c3870ca07119fe7687a1ebe9f 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -40,10 +40,6 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
                       platform::errors::InvalidArgument("npu only accept LoDTensor"));
     output_t->mutable_data<T>(ctx.GetPlace());
 
-    // add copy ids to ensure ids_t is prepared.
-    std::vector<int64_t> ids;
-    TensorToVector(*ids_t, ctx.device_context(), &ids);
-
     NpuOpRunner runner;
     runner.SetType("GatherV2")
         .AddInput(*table_t)