diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 22c8e1c1665f121cda6ba33f23cb7fc0749da025..15c478e531e9c756bdb4296bbc64e65aab331828 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -19,6 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" +#endif #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -166,8 +170,30 @@ void TensorFromVector(const std::vector& src, // Since vector is on cpu, I think this function should be a "sync" operation, // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. vector -> npu pinned tensor + Tensor npu_pinned_tensor(dst->type()); + platform::NPUPinnedPlace npu_pinned_place; + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(dst->dims(), npu_pinned_place); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. 
record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif } @@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, - src_place, src_ptr, size, nullptr); + // 1. vector -> npu pinned tensor + platform::NPUPinnedPlace npu_pinned_place; + Tensor npu_pinned_tensor; + npu_pinned_tensor.Resize(dst->dims()); + auto npu_pinned_ptr = + npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type()); + memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size); + + // 2. async copy npu pinned tensor -> npu tensor + memory::Copy( + BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + npu_pinned_place, npu_pinned_ptr, size, + reinterpret_cast(ctx).stream()); + + // 3. record event + auto npu_pinned_allocator = + static_cast( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(npu_pinned_place) + .get()); + paddle::memory::allocation::Allocation* allocation = + npu_pinned_tensor.Holder().get(); + npu_pinned_allocator->RecordEvent( + allocation, + reinterpret_cast(ctx).stream()); } #endif delete[] array; diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index cfc933c7a76fa77dca3bf368a3e55cc1c7485bea..79d77235b7c81b75d00336d7198e836c18eb3347 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include #include "acl/acl.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -30,6 +31,7 @@ using Tensor = framework::Tensor; using DataLayout = framework::DataLayout; using NPUAttribute = framework::NPUAttribute; using NPUAttributeMap = framework::NPUAttributeMap; +using DeviceContextPool = platform::DeviceContextPool; class NpuOpRunner { public:
@@ -90,41 +92,70 @@ aclrtStream GetCurrentNPUStream(int device_id = -1);
 
+/**
+ * Fills every element of an initialized NPU tensor with the constant `val`.
+ *
+ * A single-element tensor is filled by writing `val` into an NPU-pinned host
+ * tensor and copying it to the device on the NPU stream of the tensor's own
+ * device. Any other element count goes through TensorFromVector with a host
+ * vector holding numel() copies of `val`.
+ *
+ * @param tensor Destination tensor. It must already be initialized and be
+ *               placed on an NPUPlace; both conditions are checked with
+ *               PADDLE_ENFORCE_EQ and reported as InvalidArgument.
+ * @param val    Value stored into every element.
+ */
 template <typename T>
 void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small
-  // like 1e-8.
-  constexpr float MIN_PRECISION_FOR_POWER = 1e-3;
   PADDLE_ENFORCE_EQ(
       tensor->IsInitialized(), true,
       platform::errors::InvalidArgument("The tensor should be initialized."));
   PADDLE_ENFORCE_EQ(
       platform::is_npu_place(tensor->place()), true,
       platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) &&
-      static_cast<float>(val) > MIN_PRECISION_FOR_POWER) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(tensor->place());
-    auto stream = GetCurrentNPUStream(
-        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             stream);
-    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
-                              {{"power", static_cast<float>(1)},
-                               {"scale", static_cast<float>(0)},
-                               {"shift", static_cast<float>(val)}});
-    runner.Run(stream);
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
+
+  // Target the device that owns `tensor`, as the replaced code did, rather
+  // than the thread's current device.
+  const auto npu_place = BOOST_GET_CONST(platform::NPUPlace, tensor->place());
+  // Keep numel() in its own type; narrowing to int could truncate the element
+  // count of a very large tensor.
+  const auto numel = tensor->numel();
+  if (numel == 1) {
+    // Fetch the stream once; the copy and the event must use the same one.
+    auto stream = GetCurrentNPUStream(npu_place.device);
+
+    // Stage the scalar in an NPU-pinned host tensor.
+    Tensor npu_pinned_tensor(tensor->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
+    *npu_pinned_ptr = val;
+
     memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tensor->data<void>(), platform::CPUPlace(), array,
-                 tensor->numel() * sizeof(T), nullptr);
-    delete[] array;
+                 tensor->data<void>(), npu_pinned_place, npu_pinned_ptr,
+                 sizeof(T), stream);
+
+    // Hand the staging allocation and the stream to the pinned allocator.
+    // NOTE(review): presumably this keeps the pinned buffer alive until the
+    // copy queued on `stream` completes - confirm in NPUPinnedAllocator.
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation *allocation =
+        npu_pinned_tensor.Holder().get();
+
+    npu_pinned_allocator->RecordEvent(allocation, stream);
+  } else {
+    // NOTE(review): TensorFromVector is understood to resize its destination
+    // to a 1-D tensor of vec.size() elements - confirm in tensor_util.h. The
+    // dims are copied here and restored below so the caller's shape survives.
+    const auto dims = tensor->dims();
+    std::vector<T> vec(numel, static_cast<T>(val));
+    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
+        platform::DeviceContextPool::Instance().Get(npu_place));
+
+    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
+    tensor->Resize(dims);
   }
 }
 