Unverified commit 85512d60, authored by liym27, committed by GitHub

[NPU] Support async copy for TensorFromVector with event (#32563)

Parent f1d63029
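This change replaces the previous synchronous host-to-NPU copy (which passed a nullptr stream to memory::Copy) with a staged path: the host vector is first copied into an NPU pinned buffer, the pinned-to-device copy is enqueued on the device context's stream, and an event is recorded on the pinned allocation so the allocator does not release the staging buffer while the copy is still in flight. FillNpuTensorWithConstant is reworked on top of the same mechanism.

A hedged caller-side sketch of what this enables. CopyHostVectorToNpu is a hypothetical helper; TensorFromVector, NPUDeviceContext, and Wait() are the Paddle fluid APIs this diff touches or relies on:

```cpp
#include <vector>

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

namespace paddle {

// Copies a host vector into an NPU tensor. After this commit the
// host-to-device leg runs asynchronously on ctx.stream(), staged
// through an NPU pinned buffer.
void CopyHostVectorToNpu(const platform::NPUDeviceContext& ctx,
                         framework::Tensor* dst) {
  std::vector<float> src(1024, 1.0f);
  framework::TensorFromVector<float>(src, ctx, dst);
  // The copy may still be in flight here; synchronize before reading
  // dst from the host or using it outside this stream.
  ctx.Wait();
}

}  // namespace paddle
```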
@@ -19,6 +19,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/dlpack_tensor.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/memory/allocation/allocator_facade.h"
+#ifdef PADDLE_WITH_ASCEND_CL
+#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
+#endif
 #include "paddle/fluid/platform/device_context.h"
 namespace paddle {
@@ -166,8 +170,30 @@ void TensorFromVector(const std::vector<T>& src,
 // Since vector is on cpu, I think this function should be a "sync" operation,
 // so pass nullptr as stream to memory::Copy().
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    // 1. vector -> npu pinned tensor
+    Tensor npu_pinned_tensor(dst->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>(dst->dims(), npu_pinned_place);
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
 }
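Why step 3 matters: npu_pinned_tensor is a function local, so its pinned allocation would normally be released on return even though the copy enqueued in step 2 may still be running on the stream. RecordEvent tags the allocation with an event recorded on that stream, letting the NPUPinnedAllocator defer the actual free until the copy has finished. Below is a minimal sketch of that deferred-free idea under assumed semantics; it is not Paddle's implementation, and the aclrt* event calls are taken from the public ACL runtime API:

```cpp
#include <unordered_map>

#include "acl/acl.h"

// Hypothetical allocator fragment: frees of event-tagged allocations
// are deferred until the recorded event has completed on its stream.
class PinnedAllocatorSketch {
 public:
  void RecordEvent(void* allocation, aclrtStream stream) {
    aclrtEvent event;
    aclrtCreateEvent(&event);
    aclrtRecordEvent(event, stream);  // fires once prior stream work is done
    pending_[allocation] = event;
  }

  // Called on free; returns true only when the memory is safe to release.
  bool CanFreeNow(void* allocation) {
    auto it = pending_.find(allocation);
    if (it == pending_.end()) return true;  // never tagged, free immediately
    aclrtEventStatus status;
    aclrtQueryEvent(it->second, &status);
    if (status != ACL_EVENT_STATUS_COMPLETE) return false;  // copy in flight
    aclrtDestroyEvent(it->second);
    pending_.erase(it);
    return true;
  }

 private:
  std::unordered_map<void*, aclrtEvent> pending_;
};
```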
@@ -206,8 +232,31 @@ inline void TensorFromVector(const std::vector<bool>& src,
 #endif
 #ifdef PADDLE_WITH_ASCEND_CL
   else if (platform::is_npu_place(dst_place)) {  // NOLINT
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
-                 src_place, src_ptr, size, nullptr);
+    // 1. vector -> npu pinned tensor
+    platform::NPUPinnedPlace npu_pinned_place;
+    Tensor npu_pinned_tensor;
+    npu_pinned_tensor.Resize(dst->dims());
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data(npu_pinned_place, dst->type());
+    memory::Copy(npu_pinned_place, npu_pinned_ptr, src_place, src_ptr, size);
+
+    // 2. async copy npu pinned tensor -> npu tensor
+    memory::Copy(
+        BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr,
+        npu_pinned_place, npu_pinned_ptr, size,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
+
+    // 3. record event
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator*>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation* allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(
+        allocation,
+        reinterpret_cast<const platform::NPUDeviceContext&>(ctx).stream());
   }
 #endif
   delete[] array;
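A note on this overload: std::vector<bool> is the bit-packed specialization, so it exposes no contiguous bool* that memory::Copy could read. The surrounding code therefore expands the bits into a plain heap array first (released by the delete[] array above) and only then stages that buffer through pinned memory. A small illustration of the expansion, as a hypothetical helper that is not part of the diff:

```cpp
#include <cstddef>
#include <vector>

// std::vector<bool> stores bits, not bools; materialize one byte per
// element so the result can be handed to a memcpy-style copy routine.
bool* UnpackBoolVector(const std::vector<bool>& src) {
  bool* array = new bool[src.size()];
  for (std::size_t i = 0; i < src.size(); ++i) {
    array[i] = static_cast<bool>(src[i]);
  }
  return array;  // caller must delete[] once the device copy has completed
}
```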
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <vector>
 #include "acl/acl.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
 namespace paddle {
@@ -30,6 +31,7 @@ using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
 using NPUAttribute = framework::NPUAttribute;
 using NPUAttributeMap = framework::NPUAttributeMap;
+using DeviceContextPool = platform::DeviceContextPool;
 class NpuOpRunner {
  public:
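The new DeviceContextPool alias is here because FillNpuTensorWithConstant (last hunk) now fetches the current NPUDeviceContext from the pool instead of working only with a raw stream. A sketch of that lookup; the calls themselves appear in the diff, while the header paths are assumed:

```cpp
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/npu_info.h"

// Returns the device context for the current NPU, as the new else
// branch of FillNpuTensorWithConstant does.
paddle::platform::NPUDeviceContext* GetCurrentNpuCtx() {
  auto device_id = paddle::platform::GetCurrentNPUDeviceId();
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  return static_cast<paddle::platform::NPUDeviceContext*>(
      pool.Get(paddle::platform::NPUPlace(device_id)));
}
```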
@@ -90,41 +92,42 @@ aclrtStream GetCurrentNPUStream(int device_id = -1);
 template <typename T>
 void FillNpuTensorWithConstant(Tensor *tensor, T val) {
-  // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small
-  // like 1e-8.
-  constexpr float MIN_PRECISION_FOR_POWER = 1e-3;
   PADDLE_ENFORCE_EQ(
       tensor->IsInitialized(), true,
       platform::errors::InvalidArgument("The tensor should be initialized."));
   PADDLE_ENFORCE_EQ(
       platform::is_npu_place(tensor->place()), true,
       platform::errors::InvalidArgument("The tensor should be on NPUPlace."));
-  // do async for better performance
-  if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) &&
-      static_cast<float>(val) > MIN_PRECISION_FOR_POWER) {
-    Tensor tmp(tensor->type());
-    tmp.Resize(tensor->dims());
-    tmp.mutable_data<T>(tensor->place());
-    auto stream = GetCurrentNPUStream(
-        BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device);
-    platform::NPUMemsetAsync(tmp.data<void>(), 0, tmp.numel() * sizeof(T),
-                             stream);
-    auto runner = NpuOpRunner("Power", {tmp}, {*tensor},
-                              {{"power", static_cast<float>(1)},
-                               {"scale", static_cast<float>(0)},
-                               {"shift", static_cast<float>(val)}});
-    runner.Run(stream);
-  } else {
-    T *array = new T[tensor->numel()];
-    for (unsigned int i = 0; i < tensor->numel(); ++i) {
-      array[i] = static_cast<T>(val);
-    }
-    std::vector<T> vec(tensor->numel(), static_cast<T>(val));
-    // do sync copy
+  int numel = tensor->numel();
+  if (numel == 1) {
+    Tensor npu_pinned_tensor(tensor->type());
+    platform::NPUPinnedPlace npu_pinned_place;
+    auto npu_pinned_ptr =
+        npu_pinned_tensor.mutable_data<T>({1}, npu_pinned_place);
+    *npu_pinned_ptr = val;
     memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()),
-                 tensor->data<void>(), platform::CPUPlace(), array,
-                 tensor->numel() * sizeof(T), nullptr);
-    delete[] array;
+                 tensor->data<void>(), npu_pinned_place, npu_pinned_ptr,
+                 sizeof(T), GetCurrentNPUStream());
+
+    auto npu_pinned_allocator =
+        static_cast<paddle::memory::allocation::NPUPinnedAllocator *>(
+            paddle::memory::allocation::AllocatorFacade::Instance()
+                .GetAllocator(npu_pinned_place)
+                .get());
+    paddle::memory::allocation::Allocation *allocation =
+        npu_pinned_tensor.Holder().get();
+    npu_pinned_allocator->RecordEvent(allocation, GetCurrentNPUStream());
+  } else {
+    std::vector<T> vec(numel, static_cast<T>(val));
+    auto device_id = platform::GetCurrentNPUDeviceId();
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto *dev_ctx = static_cast<platform::NPUDeviceContext *>(
+        pool.Get(platform::NPUPlace(device_id)));
+
+    paddle::framework::TensorFromVector<T>(vec, *dev_ctx, tensor);
   }
 }
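After this change FillNpuTensorWithConstant has two paths: a one-element tensor writes the scalar straight into a pinned buffer and copies it asynchronously on the current NPU stream (recording an event, as above), while larger tensors build a host std::vector and reuse the new async TensorFromVector. A hedged usage sketch; FillExample is hypothetical, and the tensor must already be initialized on NPUPlace, per the enforce checks:

```cpp
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/platform/place.h"

namespace paddle {

void FillExample(framework::Tensor* t) {
  // Assumes t->mutable_data<float>(platform::NPUPlace(0)) ran earlier.
  operators::FillNpuTensorWithConstant<float>(t, 0.5f);
  // numel() == 1 takes the pinned-scalar fast path; anything larger goes
  // through std::vector<float> plus TensorFromVector on the device
  // context fetched from DeviceContextPool.
}

}  // namespace paddle
```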