未验证 提交 2a672f68 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] enable async copy and add wait before sync operation (#31956)

* enable async copy and  add wait before sync operation

* remove unnecessary wait

* add FillNpuTensorWithConstant

* refine

* fix fill_constant

* make TensorFromVector/TensorToVector sync
上级 efa85f8c
...@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src, ...@@ -160,11 +160,15 @@ void TensorFromVector(const std::vector<T>& src,
} }
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
// NOTE(zhiqiu): Be careful that aclrtMemcpyAsync is different from
// cudaMemcpyAsync.
// cudaMemcpyAsync is actually "sync" between cpu <-> gpu.
// aclrtMemcpyAsync is really "async" between cpu <-> npu.
// Since vector is on cpu, I think this function should be a "sync" operation,
// so pass nullptr as stream to memory::Copy().
else if (platform::is_npu_place(dst_place)) { // NOLINT else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, src_place, src_ptr, size, nullptr);
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
} }
...@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src, ...@@ -203,10 +207,8 @@ inline void TensorFromVector(const std::vector<bool>& src,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(dst_place)) { // NOLINT else if (platform::is_npu_place(dst_place)) { // NOLINT
memory::Copy( memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, src_place, src_ptr, size, nullptr);
src_ptr, size,
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
delete[] array; delete[] array;
...@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -266,10 +268,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place, dst_ptr,
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
src_ptr, size, size, nullptr);
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
} }
...@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src, ...@@ -302,10 +303,9 @@ inline void TensorToVector(const Tensor& src,
#endif #endif
#ifdef PADDLE_WITH_ASCEND_CL #ifdef PADDLE_WITH_ASCEND_CL
else if (platform::is_npu_place(src.place())) { // NOLINT else if (platform::is_npu_place(src.place())) { // NOLINT
memory::Copy( memory::Copy(dst_place, dst_ptr,
dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr,
src_ptr, size, size, nullptr);
reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
} }
#endif #endif
for (unsigned int i = 0; i < src.numel(); i++) { for (unsigned int i = 0; i < src.numel(); i++) {
......
...@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -209,12 +209,6 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::SetNPUDeviceId(dst_place.device); platform::SetNPUDeviceId(dst_place.device);
// NOTE(ascendrc): NPU memcpy async from host to device is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
...@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place, ...@@ -222,6 +216,12 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream);
} else { } else {
// On NPU, async operation after sync operation is ok, while sync operation
// after async is not ok, since the async operation may not be done.
// So, it is needed to do wait before sync operation.
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE);
} }
...@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -237,12 +237,6 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::SetNPUDeviceId(src_place.device); platform::SetNPUDeviceId(src_place.device);
// NOTE(ascendrc): NPU memcpy async from device to host is a "real" async,
// which is different from CUDA. In Paddle, when async is called, "sync"
// is run actually, which means Paddle doesn't fully supported async.
// TODO(ascendrc): Support NPU memcpy async for better performance.
stream = nullptr;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place << " by thream(" << stream << ")"; << dst_place << " by thream(" << stream << ")";
...@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place, ...@@ -250,6 +244,9 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU");
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream);
} else { } else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST);
} }
...@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -272,6 +269,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); stream);
} else { } else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
...@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place, ...@@ -286,6 +287,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
stream); stream);
} else { } else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
static_cast<platform::NPUDeviceContext*>(pool.Get(dst_place))->Wait();
platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::RecordEvent record_event("NpuMemcpyPeerSync:NPU->NPU");
platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE);
} }
......
...@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_y, ctx, tensor_y); TensorFromVector(init_y, ctx, tensor_y);
tensor_y->Resize({10, 10}); tensor_y->Resize({10, 10});
ctx.Wait();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto out = scope->Var("Out"); auto out = scope->Var("Out");
auto tensor_out = out->GetMutable<f::LoDTensor>(); auto tensor_out = out->GetMutable<f::LoDTensor>();
...@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -74,7 +72,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx,
{{"Out", {"Out"}}}, attrs); {{"Out", {"Out"}}}, attrs);
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait();
std::vector<T> out_vec; std::vector<T> out_vec;
TensorToVector(*tensor_out, ctx, &out_vec); TensorToVector(*tensor_out, ctx, &out_vec);
...@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
TensorFromVector(init_dout, ctx, tensor_dout); TensorFromVector(init_dout, ctx, tensor_dout);
tensor_dout->Resize({2, 3, 5}); tensor_dout->Resize({2, 3, 5});
ctx.Wait();
// run // run
f::AttributeMap attrs; f::AttributeMap attrs;
auto op = f::OpRegistry::CreateOp( auto op = f::OpRegistry::CreateOp(
...@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, ...@@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
op->Run(*scope, place); op->Run(*scope, place);
ctx.Wait();
std::vector<T> dx_vec; std::vector<T> dx_vec;
TensorToVector(*tensor_dx, ctx, &dx_vec); TensorToVector(*tensor_dx, ctx, &dx_vec);
......
...@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> { ...@@ -65,8 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel<T> {
Tensor tensor_tmp(data_type); Tensor tensor_tmp(data_type);
tensor_tmp.mutable_data<T>({1}, ctx.GetPlace()); tensor_tmp.mutable_data<T>({1}, ctx.GetPlace());
std::vector<T> init = {value}; FillNpuTensorWithConstant<T>(&tensor_tmp, value);
TensorFromVector(init, ctx.device_context(), &tensor_tmp);
out_var->mutable_data<T>(shape, place); out_var->mutable_data<T>(shape, place);
auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var},
......
...@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { ...@@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) {
return iter->second; return iter->second;
} }
aclrtStream GetCurrentNPUStream() { aclrtStream GetCurrentNPUStream(int device_id) {
int device_id = platform::GetCurrentNPUDeviceId(); if (device_id == -1) {
device_id = platform::GetCurrentNPUDeviceId();
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto *dev_ctx = static_cast<platform::NPUDeviceContext *>( auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
pool.Get(platform::NPUPlace(device_id))); pool.Get(platform::NPUPlace(device_id)));
...@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) { ...@@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) {
VLOG(4) << "after aclopCompileAndExecute: " << ret; VLOG(4) << "after aclopCompileAndExecute: " << ret;
PADDLE_ENFORCE_NPU_SUCCESS(ret); PADDLE_ENFORCE_NPU_SUCCESS(ret);
} }
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -86,6 +86,44 @@ class NpuOpRunner { ...@@ -86,6 +86,44 @@ class NpuOpRunner {
aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype); aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype);
aclrtStream GetCurrentNPUStream(int device_id = -1);
template <typename T>
// Fill every element of an already-allocated NPU tensor with the constant
// `val`.
//
// Preconditions (enforced): `tensor` is initialized and resides on NPUPlace.
//
// Fast path: for float / float16 the fill runs asynchronously on the NPU
// stream of the tensor's device — memset the buffer to zero, then run the
// "Power" op (power=1, scale=0, shift=val), which computes 0*x + val.
// Slow path: for all other dtypes the value is materialized on the host and
// copied to the device synchronously (nullptr stream => sync copy).
void FillNpuTensorWithConstant(Tensor *tensor, T val) {
  PADDLE_ENFORCE_EQ(
      tensor->IsInitialized(), true,
      platform::errors::InvalidArgument("The tensor should be initialized."));
  PADDLE_ENFORCE_EQ(
      platform::is_npu_place(tensor->place()), true,
      platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
  // do async for better performance
  if (typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) {
    Tensor tmp(tensor->type());
    tmp.Resize(tensor->dims());
    tmp.mutable_data<T>(tensor->place());
    auto stream = GetCurrentNPUStream(
        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
                             stream);
    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
                              {{"power", static_cast<float>(1)},
                               {"scale", static_cast<float>(0)},
                               {"shift", static_cast<float>(val)}});
    runner.Run(stream);
  } else {
    // Build the constant data on the host. Using std::vector (instead of a
    // raw new[]/delete[] pair) avoids a leak if memory::Copy throws, and
    // avoids the previous signed/unsigned loop-counter mismatch.
    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
    // do sync copy (nullptr stream makes memory::Copy synchronous)
    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
                 tensor->data<void>(), platform::CPUPlace(), vec.data(),
                 tensor->numel() * sizeof(T), nullptr);
  }
}
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif #endif
...@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() { ...@@ -255,6 +255,7 @@ NPUDeviceContext::~NPUDeviceContext() {
void NPUDeviceContext::Wait() const { void NPUDeviceContext::Wait() const {
platform::RecordEvent record_event("NPUDeviceContext/wait"); platform::RecordEvent record_event("NPUDeviceContext/wait");
NPUDeviceGuard guard(place_.device); NPUDeviceGuard guard(place_.device);
VLOG(4) << "NPU context Wait";
PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice());
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册