未验证 提交 2a672f68 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] enable async copy and add wait before sync operation (#31956)

* enable async copy and  add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* make TensorFromVector/TensorToVector sync
上级 efa85f8c
...@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, src_place, src_ptr, size, nullptr);
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
} }
...@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, src_place, src_ptr, size, nullptr);
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
delete[] array; delete[] array;
...@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place, dst_ptr,
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
src_ptr, size, size, nullptr);
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
} }
...@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src, ...@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place, dst_ptr,
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
src_ptr, size, size, nullptr);
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
for (unsigned int i = 0; i < src.numel(); i++) { for (unsigned int i = 0; i < src.numel(); i++) {
......
...@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::SetNPUDeviceId(dst_place.device); platform::SetNPUDeviceId(dst_place.device);
// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
...@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else { } else {
// On NPU, async operation after sync operation is ok, while sync operation
// after async is not ok, since the async operation may not be done.
// So, it is needed to do wait before sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
} }
...@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::SetNPUDeviceId(src_place.device); platform::SetNPUDeviceId(src_place.device);
// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
...@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
} }
...@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); stream);
} else { } else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
...@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); stream);
} else { } else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
......
...@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_y, ctx, tensor_y); TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10}); tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto out = scope->Var("Out"); auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
...@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs); {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait();
std::vector<T> out_vec; std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
...@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_dout, ctx, tensor_dout); TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5}); tensor_dout->Resize({2, 3, 5});
ctx.Wait();
// run // run
f::AttributeMap attrs; f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp( auto op = f::OpRegistry::CreateOp(
...@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait();
std::vector<T> dx_vec; std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec); TensorToVector(*tensor_dx, ctx, &dx_vec);
......
...@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> { ...@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
Tensor tensor_tmp(data_type); Tensor tensor_tmp(data_type);
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace()); tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
std::vector<T> init = {value}; FillNpuTensorWithConstant<T>(&tensor_tmp, value);
TensorFromVector(init, ctx.device_context(), &tensor_tmp);
out_var->mutable_data<T>(shape, place); out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
......
...@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { ...@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second; return iter->second;
} }
aclrtStream GetCurrentNPUStream() { aclrtStream GetCurrentNPUStream(int device_id) {
int device_id = platform::GetCurrentNPUDeviceId(); if (device_id == -1) {
device_id = platform::GetCurrentNPUDeviceId();
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
pool.Get(platform::NPUPlace(device_id))); pool.Get(platform::NPUPlace(device_id)));
...@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) { ...@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "after aclopCompileAndExecute: " << ret; VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret); PADDLE_ENFORCE_NPU_SUCCESS(ret);
} }
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -86,6 +86,44 @@ class NpuOpRunner { ...@@ -86,6 +86,44 @@ class NpuOpRunner {
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype); aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
aclrtStream GetCurrentNPUStream(int device_id = -1);
template <typename T>
// Fill every element of an already-allocated NPU tensor with the constant
// `val`.
//
// Preconditions (enforced): `tensor` is initialized and resides on NPUPlace.
//
// Fast path: for float / float16 the fill runs asynchronously on the NPU
// stream of the tensor's device — memset the buffer to zero, then run the
// "Power" op (power=1, scale=0, shift=val), which computes 0*x + val.
// Slow path: for all other dtypes the value is materialized on the host and
// copied to the device synchronously (nullptr stream => sync copy).
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
  PADDLE_ENFORCE_EQ(
      tensor->IsInitialized(), true,
      platform::errors::InvalidArgument("The tensor should be initialized."));
  PADDLE_ENFORCE_EQ(
      platform::is_npu_place(tensor->place()), true,
      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
  // do async for better performance
  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
    Tensor tmp(tensor->type());
    tmp.Resize(tensor->dims());
    tmp.mutable_data<T>(tensor->place());
    auto stream = GetCurrentNPUStream(
        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
                             stream);
    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
                              {{"power", static_cast<float>(1)},
                               {"scale", static_cast<float>(0)},
                               {"shift", static_cast<float>(val)}});
    runner.Run(stream);
  } else {
    // Build the constant data on the host. Using std::vector (instead of a
    // raw new[]/delete[] pair) avoids a leak if memory::Copy throws, and
    // avoids the previous signed/unsigned loop-counter mismatch.
    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
    // do sync copy (nullptr stream makes memory::Copy synchronous)
    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
                 tensor->data<void>(), platform::CPUPlace(), vec.data(),
                 tensor->numel() * sizeof(T), nullptr);
  }
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif #endif
...@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() { ...@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() {
void NPUDeviceContext::Wait() const { void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event("NPUDeviceContext/wait"); platform::RecordEvent record_event("NPUDeviceContext/wait");
NPUDeviceGuard guard(place_.device); NPUDeviceGuard guard(place_.device);
VLOG(4) << "NPU context Wait";
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册