From cbf22d65995f8c1ffd0906cc8ba0596f98e87fc8 Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 7 Jul 2021 18:23:53 +0800
Subject: [PATCH] [NPU] NpuOpRunner supports host tensor as input (#33992)

* NpuOpRunner supports host tensor as input

* fix compile issue
---
 .../fluid/operators/lookup_table_v2_op_npu.cc | 14 ++--
 paddle/fluid/operators/npu_op_runner.cc       | 64 +++++++++++++++++--
 paddle/fluid/operators/npu_op_runner.h        | 26 ++++++--
 3 files changed, 85 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
index b4a861ed19c..686ffc98de7 100644
--- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc
+++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc
@@ -39,14 +39,14 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
         table_var->IsType<framework::LoDTensor>(), true,
         platform::errors::InvalidArgument("npu only accept LoDTensor"));
     output_t->mutable_data<T>(ctx.GetPlace());
 
-    framework::NPUAttributeMap attr_input = {{"validate_indices", false}};
-    const auto &runner =
-        NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input);
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    runner.Run(stream);
+    NpuOpRunner runner;
+    runner.SetType("GatherV2")
+        .AddInput(*table_t)
+        .AddInput(*ids_t)
+        .AddInput(std::vector<int32_t>{0})
+        .AddOutput(*output_t);
+    runner.Run();
   }
 };
 
diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc
index a6ea656cfcd..25ef24d04d2 100644
--- a/paddle/fluid/operators/npu_op_runner.cc
+++ b/paddle/fluid/operators/npu_op_runner.cc
@@ -74,15 +74,15 @@ aclrtStream GetCurrentNPUStream(int device_id) {
   return dev_ctx->stream();
 }
 
-NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) {
-  attr_ = aclopCreateAttr();
-}
+NpuOpRunner::NpuOpRunner() {}
+
+NpuOpRunner::NpuOpRunner(const std::string &op_type) : op_type_(op_type) {}
 
-NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs,
+NpuOpRunner::NpuOpRunner(const std::string &op_type,
+                         const std::vector<Tensor> &inputs,
                          const std::vector<Tensor> &outputs,
                          const NPUAttributeMap &attrs)
     : op_type_(op_type) {
-  attr_ = aclopCreateAttr();
   AddInputs(inputs);
   AddOutputs(outputs);
   AddAttrs(attrs);
@@ -108,8 +108,16 @@ NpuOpRunner::~NpuOpRunner() {
 
 const std::string &NpuOpRunner::Type() { return op_type_; }
 
+NpuOpRunner &NpuOpRunner::SetType(const std::string &name) {
+  op_type_ = name;
+  return *this;
+}
+
 NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
                                   const NPUAttribute &attr) {
+  if (!attr_) {
+    attr_ = aclopCreateAttr();
+  }
   if (attr.type() == typeid(bool)) {
     PADDLE_ENFORCE_NPU_SUCCESS(
         aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr)));
@@ -191,6 +199,46 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
   return *this;
 }
 
+NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) {
+  // create aclTensorDesc
+  input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type));
+  // create aclDataBuffer
+  input_buffers_.emplace_back(CreateDataBuffer(tensor));
+  return *this;
+}
+
+NpuOpRunner &NpuOpRunner::AddInput(std::vector<int32_t> &&dims) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *dev_ctx =
+      static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace()));
+  Tensor host_tensor;
+  TensorFromVector(dims, *dev_ctx, &host_tensor);
+  host_tensors_.emplace_back(host_tensor);
+
+  // create aclTensorDesc
+  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
+  // create aclDataBuffer
+  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
+
+  return *this;
+}
+
+NpuOpRunner &NpuOpRunner::AddInput(std::vector<int64_t> &&dims) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *dev_ctx =
+      static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace()));
+  Tensor host_tensor;
+  TensorFromVector(dims, *dev_ctx, &host_tensor);
+  host_tensors_.emplace_back(host_tensor);
+
+  // create aclTensorDesc
+  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
+  // create aclDataBuffer
+  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
+
+  return *this;
+}
+
 NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
   // create aclTensorDesc
   output_descs_.emplace_back(CreateTensorDesc(tensor));
@@ -272,7 +320,8 @@ std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
   return output_buffers_;
 }
 
-aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
+aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
+                                             aclMemType mem_type) {
   auto dtype = ConvertToNpuDtype(tensor.type());
   auto format = ConvertToNpuFormat(tensor.layout());
   auto dims = framework::vectorize(tensor.dims());
@@ -287,6 +336,9 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
   PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
   PADDLE_ENFORCE_NPU_SUCCESS(
       aclSetTensorStorageShape(desc, dims.size(), dims.data()));
+  if (mem_type == ACL_MEMTYPE_HOST) {
+    PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
+  }
   return desc;
 }
 
diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h
index a637935c749..2257c209550 100644
--- a/paddle/fluid/operators/npu_op_runner.h
+++ b/paddle/fluid/operators/npu_op_runner.h
@@ -35,11 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool;
 
 class NpuOpRunner {
  public:
-  explicit NpuOpRunner(std::string op_type);
-  explicit NpuOpRunner(std::string op_type,
-                       const std::vector<Tensor> &inputs = {},
-                       const std::vector<Tensor> &outputs = {},
-                       const NPUAttributeMap &attrs = {});
+  NpuOpRunner();
+  explicit NpuOpRunner(const std::string &op_type);
+  NpuOpRunner(const std::string &op_type,
+              const std::vector<Tensor> &inputs = {},
+              const std::vector<Tensor> &outputs = {},
+              const NPUAttributeMap &attrs = {});
 
   // NOTE(zhiqiu): why forbid copy and operator= ?
   // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
@@ -53,12 +54,23 @@ class NpuOpRunner {
 
   const std::string &Type();
 
+  NpuOpRunner &SetType(const std::string &name);
+
   NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr);
 
   NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs);
 
   NpuOpRunner &AddInput(const Tensor &tensor);
 
+  // NOTE(zhiqiu): CANN-5.0.2 supports input tensors on host.
+  // Specifically, the tensor of shape, tensor of dims, etc., which are small
+  // vector/list.
+  NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type);
+
+  NpuOpRunner &AddInput(std::vector<int32_t> &&dims);
+
+  NpuOpRunner &AddInput(std::vector<int64_t> &&dims);
+
   NpuOpRunner &AddOutput(const Tensor &tensor);
 
   NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors);
@@ -82,7 +94,8 @@ class NpuOpRunner {
   void Run(aclrtStream stream = nullptr) const;
 
  private:
-  aclTensorDesc *CreateTensorDesc(Tensor tensor);
+  aclTensorDesc *CreateTensorDesc(Tensor tensor,
+                                  aclMemType mem_type = ACL_MEMTYPE_DEVICE);
   aclDataBuffer *CreateDataBuffer(Tensor tensor);
 
  private:
@@ -91,6 +104,7 @@ class NpuOpRunner {
   std::vector<aclDataBuffer *> output_buffers_;
   std::vector<aclTensorDesc *> input_descs_;
   std::vector<aclTensorDesc *> output_descs_;
+  std::vector<Tensor> host_tensors_;
   aclopAttr *attr_{nullptr};
 };
 
-- 
GitLab
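Usage sketch (illustrative, not part of the patch): the snippet below mirrors the lookup_table_v2_op_npu.cc change above to show how the new chained interface and host-tensor inputs fit together inside an NPU kernel; table_t, ids_t and output_t stand for the kernel's usual LoDTensor pointers and are assumed here rather than defined by this patch.

    // Illustrative sketch, mirroring the GatherV2 call in the patch above.
    // table_t, ids_t and output_t are assumed kernel tensors (not defined here).
    NpuOpRunner runner;
    runner.SetType("GatherV2")
        .AddInput(*table_t)                 // device tensor: embedding table
        .AddInput(*ids_t)                   // device tensor: lookup indices
        .AddInput(std::vector<int32_t>{0})  // gather axis, staged as a host tensor
                                            // via AddInput(std::vector<int32_t>&&),
                                            // whose desc is marked ACL_MEMTYPE_HOST
        .AddOutput(*output_t);
    runner.Run();  // no stream argument: falls back to the current NPU stream

The host tensors created for such small shape/axis vectors are kept alive in host_tensors_ until the runner is destroyed, so their buffers stay valid for the duration of the op launch.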