Unverified commit 85512d60, authored by liym27, committed by GitHub

[NPU] Support async copy for TensorFromVector with event (#32563)

Parent f1d63029
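This change replaces the previous synchronous host-to-NPU copy (which passed a nullptr stream to memory::Copy) with a staged path: the host vector is first copied into an NPU pinned buffer, the pinned-to-device copy is enqueued on the device context's stream, and an event is recorded on the pinned allocation so the allocator does not release the staging buffer while the copy is still in flight. FillNpuTensorWithConstant is reworked on top of the same mechanism.

A hedged caller-side sketch of what this enables. CopyHostVectorToNpu is a hypothetical helper; TensorFromVector, NPUDeviceContext, and Wait() are the Paddle fluid APIs this diff touches or relies on:

```cpp
#include <vector>

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {

// Copies a host vector into an NPU tensor. After this commit the
// host-to-device leg runs asynchronously on ctx.stream(), staged
// through an NPU pinned buffer.
void CopyHostVectorToNpu(const platform::NPUDeviceContext& ctx,
                         framework::Tensor* dst) {
  std::vector<float> src(1024, 1.0f);
  framework::TensorFromVector<float>(src, ctx, dst);
  // The copy may still be in flight here; synchronize before reading
  // dst from the host or using it outside this stream.
  ctx.Wait();
}

}  // namespace paddle
```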
@@ -19,6 +19,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/dlpack_tensor.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
@@ -166,8 +170,30 @@ void TensorFromVector(const std::vector<T>& src,
 // Since vector is on cpu, I think this function should be a "sync" operation,
 // so pass nullptr as stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    // 1. vector -> npu pinned tensor
+    Tensor npu_pinned_tensor(dst->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
 }
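Why step 3 matters: npu_pinned_tensor is a function local, so its pinned allocation would normally be released on return even though the copy enqueued in step 2 may still be running on the stream. RecordEvent tags the allocation with an event recorded on that stream, letting the NPUPinnedAllocator defer the actual free until the copy has finished. Below is a minimal sketch of that deferred-free idea under assumed semantics; it is not Paddle's implementation, and the aclrt* event calls are taken from the public ACL runtime API:

```cpp
#include <unordered_map>

#include "acl/acl.h"

// Hypothetical allocator fragment: frees of event-tagged allocations
// are deferred until the recorded event has completed on its stream.
class PinnedAllocatorSketch {
 public:
  void RecordEvent(void* allocation, aclrtStream stream) {
    aclrtEvent event;
    aclrtCreateEvent(&event);
    aclrtRecordEvent(event, stream);  // fires once prior stream work is done
    pending_[allocation] = event;
  }

  // Called on free; returns true only when the memory is safe to release.
  bool CanFreeNow(void* allocation) {
    auto it = pending_.find(allocation);
    if (it == pending_.end()) return true;  // never tagged, free immediately
    aclrtEventStatus status;
    aclrtQueryEvent(it->second, &status);
    if (status != ACL_EVENT_STATUS_COMPLETE) return false;  // copy in flight
    aclrtDestroyEvent(it->second);
    pending_.erase(it);
    return true;
  }

 private:
  std::unordered_map<void*, aclrtEvent> pending_;
};
```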
@@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    // 1. vector -> npu pinned tensor
+    platform::NPUPinnedPlace npu_pinned_place;
+    Tensor npu_pinned_tensor;
+    npu_pinned_tensor.Resize(dst->dims());
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type());
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
   delete[] array;
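A note on this overload: std::vector<bool> is the bit-packed specialization, so it exposes no contiguous bool* that memory::Copy could read. The surrounding code therefore expands the bits into a plain heap array first (released by the delete[] array above) and only then stages that buffer through pinned memory. A small illustration of the expansion, as a hypothetical helper that is not part of the diff:

```cpp
#include <cstddef>
#include <vector>

// std::vector<bool> stores bits, not bools; materialize one byte per
// element so the result can be handed to a memcpy-style copy routine.
bool* UnpackBoolVector(const std::vector<bool>& src) {
  bool* array = new bool[src.size()];
  for (std::size_t i = 0; i < src.size(); ++i) {
    array[i] = static_cast<bool>(src[i]);
  }
  return array;  // caller must delete[] once the device copy has completed
}
```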
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "acl/acl.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 namespace paddle {
@@ -30,6 +31,7 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 using NPUAttribute = framework::NPUAttribute;
 using NPUAttributeMap = framework::NPUAttributeMap;
+using DeviceContextPool = platform::DeviceContextPool;
 class NpuOpRunner {
  public:
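The new DeviceContextPool alias is here because FillNpuTensorWithConstant (last hunk) now fetches the current NPUDeviceContext from the pool instead of working only with a raw stream. A sketch of that lookup; the calls themselves appear in the diff, while the header paths are assumed:

```cpp
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/npu_info.h"

// Returns the device context for the current NPU, as the new else
// branch of FillNpuTensorWithConstant does.
paddle::platform::NPUDeviceContext* GetCurrentNpuCtx() {
  auto device_id = paddle::platform::GetCurrentNPUDeviceId();
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  return static_cast<paddle::platform::NPUDeviceContext*>(
      pool.Get(paddle::platform::NPUPlace(device_id)));
}
```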
@@ -90,41 +92,42 @@ aclrtStream GetCurrentNPUStream(int device_id = -1);
 template <typename T>
 void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small
-  // like 1e-8.
-  constexpr float MIN_PRECISION_FOR_POWER = 1e-3;
   PADDLE_ENFORCE_EQ(
       tensor->IsInitialized(), true,
       platform::errors::InvalidArgument("The tensor should be initialized."));
   PADDLE_ENFORCE_EQ(
       platform::is_npu_place(tensor->place()), true,
       platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) &&
-      static_cast<float>(val) > MIN_PRECISION_FOR_POWER) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(tensor->place());
-    auto stream = GetCurrentNPUStream(
-        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             stream);
-    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
-                              {{"power", static_cast<float>(1)},
-                               {"scale", static_cast<float>(0)},
-                               {"shift", static_cast<float>(val)}});
-    runner.Run(stream);
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
+  int numel = tensor->numel();
+  if (numel == 1) {
+    Tensor npu_pinned_tensor(tensor->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
+    *npu_pinned_ptr = val;
     memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tensor->data<void>(), platform::CPUPlace(), array,
-                 tensor->numel() * sizeof(T), nullptr);
-    delete[] array;
+                 tensor->data<void>(), npu_pinned_place, npu_pinned_ptr,
+                 sizeof(T), GetCurrentNPUStream());
+
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation *allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
+  } else {
+    std::vector<T> vec(numel, static_cast<T>(val));
+    auto device_id = platform::GetCurrentNPUDeviceId();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
+        pool.Get(platform::NPUPlace(device_id)));
+
+    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
   }
 }
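After this change FillNpuTensorWithConstant has two paths: a one-element tensor writes the scalar straight into a pinned buffer and copies it asynchronously on the current NPU stream (recording an event, as above), while larger tensors build a host std::vector and reuse the new async TensorFromVector. A hedged usage sketch; FillExample is hypothetical, and the tensor must already be initialized on NPUPlace, per the enforce checks:

```cpp
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {

void FillExample(framework::Tensor* t) {
  // Assumes t->mutable_data<float>(platform::NPUPlace(0)) ran earlier.
  operators::FillNpuTensorWithConstant<float>(t, 0.5f);
  // numel() == 1 takes the pinned-scalar fast path; anything larger goes
  // through std::vector<float> plus TensorFromVector on the device
  // context fetched from DeviceContextPool.
}

}  // namespace paddle
```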