From 2a672f68e8079be10d0a34dd2a43fe2367d5de0b Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Thu, 1 Apr 2021 17:06:34 +0800
Subject: [PATCH] [NPU] enable async copy and add wait before sync operation (#31956)

* enable async copy and add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* make TensorFromVector/TensorToVector sync
---
 paddle/fluid/framework/tensor_util.h          | 32 ++++++++--------
 paddle/fluid/memory/memcpy.cc                 | 29 ++++++++------
 .../elementwise/elementwise_op_npu_test.cc    |  6 ---
 .../fluid/operators/fill_constant_op_npu.cc   |  3 +-
 paddle/fluid/operators/npu_op_runner.cc       |  7 +++-
 paddle/fluid/operators/npu_op_runner.h        | 37 ++++++++++++++++++
 paddle/fluid/platform/device_context.cc       |  1 +
 7 files changed, 77 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 1df4cf2638c..cab4744ed0d 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
   }
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
+  // NOTE(zhiqiu): Be careful: aclrtMemcpyAsync is different from
+  // cudaMemcpyAsync.
+  // cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
+  // aclrtMemcpyAsync is really "async" between cpu <-> npu.
+  // Since the vector is on the CPU, this function should be a "sync"
+  // operation, so pass nullptr as stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
 }
@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(
-        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place,
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+                 src_place, src_ptr, size, nullptr);
   }
 #endif
   delete[] array;
@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
 }
@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(src.place())) {  // NOLINT
-    memory::Copy(
-        dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()),
-        src_ptr, size,
-        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+    memory::Copy(dst_place, dst_ptr,
+                 BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
+                 size, nullptr);
   }
 #endif
   for (unsigned int i = 0; i < src.numel(); i++) {
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index d616051ea6d..c76d448013f 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
 
   platform::SetNPUDeviceId(dst_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is actually run, which means Paddle doesn't fully support async.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by stream(" << stream << ")";
 
@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
   } else {
+    // On NPU, an async operation after a sync operation is OK, while a sync
+    // operation after an async one is not, since the async operation may not
+    // be done yet. So we need to wait before any sync operation.
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
   }
@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
 
   platform::SetNPUDeviceId(src_place.device);
 
-  // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
-  // which is different from CUDA. In Paddle, when async is called, "sync"
-  // is actually run, which means Paddle doesn't fully support async.
-  // TODO(ascendrc): Support NPU memcpy async for better performance.
-  stream = nullptr;
-
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place << " by stream(" << stream << ")";
 
@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
   } else {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(src_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync:NPU->CPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
   }
@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                              stream);
   } else {
+    platform::DeviceContextPool& pool =
+        platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
   }
@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
                              stream);
   } else {
+    platform::DeviceContextPool& pool =
+        platform::DeviceContextPool::Instance();
+    static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
+
     platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
     platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
   }
diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
index df6fae6c848..e6b5e5f8b78 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc
@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
   TensorFromVector(init_y, ctx, tensor_y);
   tensor_y->Resize({10, 10});
 
-  ctx.Wait();
-
   auto place = ctx.GetPlace();
   auto out = scope->Var("Out");
   auto tensor_out = out->GetMutable<f::LoDTensor>();
@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
                    {{"Out", {"Out"}}}, attrs);
 
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> out_vec;
   TensorToVector(*tensor_out, ctx, &out_vec);
@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   TensorFromVector(init_dout, ctx, tensor_dout);
   tensor_dout->Resize({2, 3, 5});
 
-  ctx.Wait();
-
   // run
   f::AttributeMap attrs;
   auto op = f::OpRegistry::CreateOp(
@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
   auto place = ctx.GetPlace();
 
   op->Run(*scope, place);
-  ctx.Wait();
 
   std::vector<T> dx_vec;
   TensorToVector(*tensor_dx, ctx, &dx_vec);
diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc
index eeacd09a9c5..4ea4c11c478 100644
--- a/paddle/fluid/operators/fill_constant_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_op_npu.cc
@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
 
     Tensor tensor_tmp(data_type);
     tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
-    std::vector<T> init = {value};
-    TensorFromVector(init, ctx.device_context(), &tensor_tmp);
+    FillNpuTensorWithConstant<T>(&tensor_tmp, value);
 
     out_var->mutable_data<T>(shape, place);
     auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index aa0c4d2dfd2..276bfa7b328 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
   return iter->second;
 }
 
-aclrtStream GetCurrentNPUStream() {
-  int device_id = platform::GetCurrentNPUDeviceId();
+aclrtStream GetCurrentNPUStream(int device_id) {
+  if (device_id == -1) {
+    device_id = platform::GetCurrentNPUDeviceId();
+  }
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
       pool.Get(platform::NPUPlace(device_id)));
@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
   VLOG(4) << "after aclopCompileAndExecute: " << ret;
   PADDLE_ENFORCE_NPU_SUCCESS(ret);
 }
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index e178f7fc6e9..5506ddd8969 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -86,6 +86,43 @@ class NpuOpRunner {
 
 aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
 
+aclrtStream GetCurrentNPUStream(int device_id = -1);
+
+template <typename T>
+void FillNpuTensorWithConstant(Tensor *tensor, T val) {
+  PADDLE_ENFORCE_EQ(
+      tensor->IsInitialized(), true,
+      platform::errors::InvalidArgument("The tensor should be initialized."));
+  PADDLE_ENFORCE_EQ(
+      platform::is_npu_place(tensor->place()), true,
+      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
+  // do async for better performance
+  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
+    Tensor tmp(tensor->type());
+    tmp.Resize(tensor->dims());
+    tmp.mutable_data<T>(tensor->place());
+    auto stream = GetCurrentNPUStream(
+        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
+    platform::NPUMemsetAsync(tmp.data<T>(), 0, tmp.numel() * sizeof(T),
+                             stream);
+    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
+                              {{"power", static_cast<float>(1)},
+                               {"scale", static_cast<float>(0)},
+                               {"shift", static_cast<float>(val)}});
+    runner.Run(stream);
+  } else {
+    T *array = new T[tensor->numel()];
+    for (int64_t i = 0; i < tensor->numel(); ++i) {
+      array[i] = static_cast<T>(val);
+    }
+    // do sync copy
+    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
+                 tensor->data<T>(), platform::CPUPlace(), array,
+                 tensor->numel() * sizeof(T), nullptr);
+    delete[] array;
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
 #endif
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 2a7519706cc..f006b490167 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() {
 void NPUDeviceContext::Wait() const {
   platform::RecordEvent record_event("NPUDeviceContext/wait");
   NPUDeviceGuard guard(place_.device);
+  VLOG(4) << "NPU context Wait";
   PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
 }
-- 
GitLab
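
The ordering contract this patch relies on is: work enqueued on a single NPU
stream executes in order, so async calls need no waits between one another,
but any synchronous call issued after async work must be preceded by a
Wait(). The sketch below illustrates that pattern with the helpers added
above, assuming an Ascend build (PADDLE_WITH_ASCEND_CL). ScaleByConstant and
its use of the "Mul" op are illustrative assumptions, not part of the patch;
only NpuOpRunner, FillNpuTensorWithConstant, GetCurrentNPUStream, and
NPUDeviceContext come from the code above.

// Hypothetical helper: scale a tensor by a constant on the NPU, then make
// the result safe to read synchronously. A minimal sketch, not part of the
// patch.
#include "paddle/fluid/operators/npu_op_runner.h"

namespace paddle {
namespace operators {

template <typename T>
void ScaleByConstant(const Tensor& x, Tensor* out, T factor,
                     const platform::NPUDeviceContext& dev_ctx) {
  // Fill a scalar tensor on the NPU. For float/float16 this enqueues an
  // async memset plus a "Power" op on the current stream and returns
  // immediately (see FillNpuTensorWithConstant above).
  Tensor scalar(x.type());
  scalar.Resize({1});
  scalar.mutable_data<T>(x.place());
  FillNpuTensorWithConstant<T>(&scalar, factor);

  // Launch the computation on the same stream. Within one stream the async
  // operations run in order, so no wait is needed between them.
  out->Resize(x.dims());
  out->mutable_data<T>(x.place());
  auto runner = NpuOpRunner("Mul", {x, scalar}, {*out}, {});
  runner.Run(dev_ctx.stream());

  // Any sync operation that follows (NPUMemcpySync, TensorToVector, ...)
  // must see finished results, so wait first. This is the same Wait() that
  // memory::Copy now performs when it is called with a nullptr stream.
  dev_ctx.Wait();
}

}  // namespace operators
}  // namespace paddle

The same contract explains the test changes above: TensorFromVector and
TensorToVector are now synchronous by design (they pass nullptr as the
stream), so the explicit ctx.Wait() calls around them in
elementwise_op_npu_test.cc became redundant and are deleted.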