未验证 提交 cbf22d65 编写于 作者: L Leo Chen 提交者: GitHub

[NPU] NpuOpRunner supports host tensor as input (#33992)

* NpuOpRunner supports host tensor as input

* fix compile issue
上级 20da7703
...@@ -39,14 +39,14 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> { ...@@ -39,14 +39,14 @@ class LookupTableV2NPUKernel : public framework::OpKernel<T> {
table_var->IsType<framework::LoDTensor>(), true, table_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument("npu only accept LoDTensor")); platform::errors::InvalidArgument("npu only accept LoDTensor"));
output_t->mutable_data<T>(ctx.GetPlace()); output_t->mutable_data<T>(ctx.GetPlace());
framework::NPUAttributeMap attr_input = {{"validate_indices", false}};
const auto &runner = NpuOpRunner runner;
NpuOpRunner("Gather", {*table_t, *ids_t}, {*output_t}, attr_input); runner.SetType("GatherV2")
auto stream = .AddInput(*table_t)
ctx.template device_context<paddle::platform::NPUDeviceContext>() .AddInput(*ids_t)
.stream(); .AddInput(std::vector<int32_t>{0})
runner.Run(stream); .AddOutput(*output_t);
runner.Run();
} }
}; };
......
...@@ -74,15 +74,15 @@ aclrtStream GetCurrentNPUStream(int device_id) { ...@@ -74,15 +74,15 @@ aclrtStream GetCurrentNPUStream(int device_id) {
return dev_ctx->stream(); return dev_ctx->stream();
} }
NpuOpRunner::NpuOpRunner(std::string op_type) : op_type_(op_type) { NpuOpRunner::NpuOpRunner() {}
attr_ = aclopCreateAttr();
} NpuOpRunner::NpuOpRunner(const std::string &op_type) : op_type_(op_type) {}
NpuOpRunner::NpuOpRunner(std::string op_type, const std::vector<Tensor> &inputs, NpuOpRunner::NpuOpRunner(const std::string &op_type,
const std::vector<Tensor> &inputs,
const std::vector<Tensor> &outputs, const std::vector<Tensor> &outputs,
const NPUAttributeMap &attrs) const NPUAttributeMap &attrs)
: op_type_(op_type) { : op_type_(op_type) {
attr_ = aclopCreateAttr();
AddInputs(inputs); AddInputs(inputs);
AddOutputs(outputs); AddOutputs(outputs);
AddAttrs(attrs); AddAttrs(attrs);
...@@ -108,8 +108,16 @@ NpuOpRunner::~NpuOpRunner() { ...@@ -108,8 +108,16 @@ NpuOpRunner::~NpuOpRunner() {
const std::string &NpuOpRunner::Type() { return op_type_; } const std::string &NpuOpRunner::Type() { return op_type_; }
// Sets (or replaces) the ACL op type this runner will execute. Part of the
// fluent builder API introduced alongside AddInput/AddOutput, e.g.
//   runner.SetType("GatherV2").AddInput(x).AddOutput(y);
// Returns *this to allow chaining.
NpuOpRunner &NpuOpRunner::SetType(const std::string &name) {
  op_type_ = name;
  return *this;
}
NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name,
const NPUAttribute &attr) { const NPUAttribute &attr) {
if (!attr_) {
attr_ = aclopCreateAttr();
}
if (attr.type() == typeid(bool)) { if (attr.type() == typeid(bool)) {
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr))); aclopSetAttrBool(attr_, name.c_str(), BOOST_GET_CONST(bool, attr)));
...@@ -191,6 +199,46 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) { ...@@ -191,6 +199,46 @@ NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor) {
return *this; return *this;
} }
// Adds an input tensor with an explicitly specified memory placement
// (ACL_MEMTYPE_HOST vs. device). `mem_type` is forwarded to CreateTensorDesc,
// which records the placement on the aclTensorDesc; the data buffer itself is
// created from the tensor as-is. Returns *this for chaining.
NpuOpRunner &NpuOpRunner::AddInput(const Tensor &tensor, aclMemType mem_type) {
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(tensor, mem_type));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(tensor));
  return *this;
}
// Adds a small host-side int32 list (e.g. an axis or dims argument such as
// AddInput(std::vector<int32_t>{0}) in the GatherV2 kernel) as an op input.
// The values are copied into a CPU Tensor whose desc is marked
// ACL_MEMTYPE_HOST so ACL reads it from host memory. Returns *this.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<int32_t> &&dims) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  TensorFromVector(dims, *dev_ctx, &host_tensor);
  // Keep a copy in host_tensors_ so the host allocation stays alive for the
  // runner's lifetime (the local `host_tensor` goes out of scope below).
  // NOTE(review): assumes Tensor copies share the underlying allocation --
  // confirm against the Tensor implementation.
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
// Adds a small host-side int64 list as an op input; int64 counterpart of the
// int32 overload above. The values are copied into a CPU Tensor whose desc is
// marked ACL_MEMTYPE_HOST so ACL reads it from host memory. Returns *this.
NpuOpRunner &NpuOpRunner::AddInput(std::vector<int64_t> &&dims) {
  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
  auto *dev_ctx =
      static_cast<platform::CPUDeviceContext *>(pool.Get(platform::CPUPlace()));
  Tensor host_tensor;
  TensorFromVector(dims, *dev_ctx, &host_tensor);
  // Keep a copy in host_tensors_ so the host allocation stays alive for the
  // runner's lifetime (the local `host_tensor` goes out of scope below).
  // NOTE(review): assumes Tensor copies share the underlying allocation --
  // confirm against the Tensor implementation.
  host_tensors_.emplace_back(host_tensor);
  // create aclTensorDesc
  input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST));
  // create aclDataBuffer
  input_buffers_.emplace_back(CreateDataBuffer(host_tensor));
  return *this;
}
NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) {
// create aclTensorDesc // create aclTensorDesc
output_descs_.emplace_back(CreateTensorDesc(tensor)); output_descs_.emplace_back(CreateTensorDesc(tensor));
...@@ -272,7 +320,8 @@ std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() { ...@@ -272,7 +320,8 @@ std::vector<aclDataBuffer *> &NpuOpRunner::GetOutputBuffers() {
return output_buffers_; return output_buffers_;
} }
aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor,
aclMemType mem_type) {
auto dtype = ConvertToNpuDtype(tensor.type()); auto dtype = ConvertToNpuDtype(tensor.type());
auto format = ConvertToNpuFormat(tensor.layout()); auto format = ConvertToNpuFormat(tensor.layout());
auto dims = framework::vectorize(tensor.dims()); auto dims = framework::vectorize(tensor.dims());
...@@ -287,6 +336,9 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) { ...@@ -287,6 +336,9 @@ aclTensorDesc *NpuOpRunner::CreateTensorDesc(Tensor tensor) {
PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format)); PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorStorageFormat(desc, format));
PADDLE_ENFORCE_NPU_SUCCESS( PADDLE_ENFORCE_NPU_SUCCESS(
aclSetTensorStorageShape(desc, dims.size(), dims.data())); aclSetTensorStorageShape(desc, dims.size(), dims.data()));
if (mem_type == ACL_MEMTYPE_HOST) {
PADDLE_ENFORCE_NPU_SUCCESS(aclSetTensorPlaceMent(desc, mem_type));
}
return desc; return desc;
} }
......
...@@ -35,11 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool; ...@@ -35,11 +35,12 @@ using DeviceContextPool = platform::DeviceContextPool;
class NpuOpRunner { class NpuOpRunner {
public: public:
explicit NpuOpRunner(std::string op_type); NpuOpRunner();
explicit NpuOpRunner(std::string op_type, explicit NpuOpRunner(const std::string &op_type);
const std::vector<Tensor> &inputs = {}, NpuOpRunner(const std::string &op_type,
const std::vector<Tensor> &outputs = {}, const std::vector<Tensor> &inputs = {},
const NPUAttributeMap &attrs = {}); const std::vector<Tensor> &outputs = {},
const NPUAttributeMap &attrs = {});
// NOTE(zhiqiu): why forbid copy and operator= ? // NOTE(zhiqiu): why forbid copy and operator= ?
// Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner, // Since we will free the tensor_descs and data_buffers in the ~NpuOpRunner,
...@@ -53,12 +54,23 @@ class NpuOpRunner { ...@@ -53,12 +54,23 @@ class NpuOpRunner {
const std::string &Type(); const std::string &Type();
NpuOpRunner &SetType(const std::string &name);
NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr);
NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs);
NpuOpRunner &AddInput(const Tensor &tensor); NpuOpRunner &AddInput(const Tensor &tensor);
// NOTE(zhiqiu): CANN-5.0.2 support input tensors on host.
// Specifically, the tensor of shape, tensor of dims, etc., which are small
// vector/list.
NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type);
NpuOpRunner &AddInput(std::vector<int32_t> &&dims);
NpuOpRunner &AddInput(std::vector<int64_t> &&dims);
NpuOpRunner &AddOutput(const Tensor &tensor); NpuOpRunner &AddOutput(const Tensor &tensor);
NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors); NpuOpRunner &AddInputs(const std::vector<Tensor> &tensors);
...@@ -82,7 +94,8 @@ class NpuOpRunner { ...@@ -82,7 +94,8 @@ class NpuOpRunner {
void Run(aclrtStream stream = nullptr) const; void Run(aclrtStream stream = nullptr) const;
private: private:
aclTensorDesc *CreateTensorDesc(Tensor tensor); aclTensorDesc *CreateTensorDesc(Tensor tensor,
aclMemType mem_type = ACL_MEMTYPE_DEVICE);
aclDataBuffer *CreateDataBuffer(Tensor tensor); aclDataBuffer *CreateDataBuffer(Tensor tensor);
private: private:
...@@ -91,6 +104,7 @@ class NpuOpRunner { ...@@ -91,6 +104,7 @@ class NpuOpRunner {
std::vector<aclDataBuffer *> output_buffers_; std::vector<aclDataBuffer *> output_buffers_;
std::vector<aclTensorDesc *> input_descs_; std::vector<aclTensorDesc *> input_descs_;
std::vector<aclTensorDesc *> output_descs_; std::vector<aclTensorDesc *> output_descs_;
std::vector<Tensor> host_tensors_;
aclopAttr *attr_{nullptr}; aclopAttr *attr_{nullptr};
}; };
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册