Commit 8278f114 authored by yanghongtian

add subgraph compute and backends

Parent 841061b1
@@ -153,12 +153,12 @@ function(lite_cc_library TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
)
# MLU_DEPS ${args_MLU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} SHARED)
@@ -204,7 +204,7 @@ function(lite_cc_binary TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
# MLU_DEPS ${args_MLU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
@@ -443,8 +443,8 @@ function(add_kernel TARGET device level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
@@ -463,7 +463,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -498,8 +498,8 @@ function(add_operator TARGET level)
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
@@ -68,7 +68,7 @@ if (WITH_TESTING)
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels}
HW_ASCENND_NPU_DEPS ${hw_ascend_npu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels})
endif()
if(LITE_WITH_FPGA)
@@ -110,7 +110,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels})
HWAscendNPU_DEPS ${hw_ascend_npu_kernels})
endif()
# for light api
@@ -132,8 +132,7 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels})
HWAscendNPU_DEPS ${hw_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -152,8 +151,7 @@ if(WITH_TESTING)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -301,6 +299,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -338,8 +337,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
MLU_DEPS ${mlu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......
@@ -8,3 +8,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(hw_ascend_npu)
if(NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs})
lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs})
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
${hw_ascend_npu_builder_libs}
${hw_ascend_npu_runtime_libs}
target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/device.h"
#include <map>
#include <string>
#include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
std::shared_ptr<HWAscendNPURuntime> Device::Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
) {
VLOG(3) << "[HWAscendNPU] Build model";
// Build the IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
ge::ModelBufferData model;
std::map<std::string, std::string> build_options;
build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
if (ret != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
return nullptr;
}
std::shared_ptr<HWAscendNPURuntime> model_runtime(
new HWAscendNPURuntime(model.data, model.length));
CHECK(model_runtime != nullptr);
if (!model_runtime->model_loaded()) {
LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
return nullptr;
}
VLOG(3) << "[HWAscendNPU]: Build done";
return model_runtime;
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
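An illustrative usage sketch of Device::Build (not part of this commit; node construction is elided), mirroring how the subgraph engine further below drives it:
// Illustrative only: the input/output nodes would be ge::op::Data and
// operator nodes collected from the converted IR graph.
std::vector<ge::Operator> input_nodes;
std::vector<ge::Operator> output_nodes;
auto runtime = paddle::lite::hw_ascend_npu::Device::Global().Build(input_nodes,
                                                                   output_nodes);
if (runtime == nullptr || !runtime->model_loaded()) {
  // Building or loading the om model failed.
}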
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ge/ge_ir_build.h" // NOLINT
#include "lite/backends/hw_ascend_npu/runtime.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
int freq_level() { return freq_level_; }
int framework_type() { return framework_type_; }
int model_type() { return model_type_; }
int device_type() { return device_type_; }
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference.
std::shared_ptr<HWAscendNPURuntime> Build(
std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT
); // NOLINT
private:
int freq_level_{3};
int framework_type_{0};
int model_type_{0};
int device_type_{0};
};
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/backends/hw_ascend_npu/target_wrapper.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
HWAscendNPURuntime::HWAscendNPURuntime(
std::shared_ptr<uint8_t> model_buff_built, size_t model_buff_size) {
model_loaded_ = (0 == LoadModelFromMem(model_buff_built, model_buff_size));
}
HWAscendNPURuntime::~HWAscendNPURuntime() {
UnloadModel();
DestroyDesc();
DestroyInput();
DestroyOutput();
}
int HWAscendNPURuntime::LoadModelFromMem(
std::shared_ptr<uint8_t> model_buff_built, size_t model_buff_size) {
if (model_loaded_) {
LOG(ERROR) << "[HWAscendNPU]: Has already loaded a model";
return 0;
}
aclError ret = aclmdlQuerySizeFromMem(model_buff_built.get(),
model_buff_size,
&model_size_,
&model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, "
"error code: "
<< ret;
return ret;
}
ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, "
"error code: "
<< ret;
return ret;
}
ret = aclrtMalloc(
&model_weights_ptr_, model_weights_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model "
"weights, error code: "
<< ret;
return ret;
}
ret = aclmdlLoadFromMemWithMem(model_buff_built.get(),
model_buff_size,
&model_id_,
model_ptr_,
model_size_,
model_weights_ptr_,
model_weights_size_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not Load model from memory, error code: "
<< ret;
return ret;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(ERROR) << "HWAscendNPU]: Can not create model descriptor.";
return ACL_ERROR_FAILURE;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not get model descriptor from model, "
"error code: "
<< ret;
return ret;
}
return ret;
}
int HWAscendNPURuntime::CreateInput(const std::vector<DDim>& idims) {
if (itensors_ != nullptr) {
DestroyInput();
}
itensors_ = aclmdlCreateDataset();
if (itensors_ == nullptr) {
LOG(ERROR) << "[HWAscendNPU]: Can not create input dataset";
return ACL_ERROR_FAILURE;
}
for (auto& dim : idims) {
void* buff_dev_ptr = nullptr;
CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr,
dim.production(),
ACL_MEM_MALLOC_NORMAL_ONLY));
aclDataBuffer* input_data_buffer =
aclCreateDataBuffer(buff_dev_ptr, dim.production());
CHECK(input_data_buffer != nullptr);
CHECK(ACL_ERROR_NONE ==
aclmdlAddDatasetBuffer(itensors_, input_data_buffer));
}
return 0;
}
int HWAscendNPURuntime::CreateOutput(const std::vector<DDim>& odims) {
if (otensors_ != nullptr) {
DestroyOutput();
}
otensors_ = aclmdlCreateDataset();
if (otensors_ == nullptr) {
LOG(ERROR) << "[HWAscendNPU]: Can not create output dataset";
return ACL_ERROR_FAILURE;
}
for (auto& dim : odims) {
void* buff_dev_ptr = nullptr;
CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr,
dim.production(),
ACL_MEM_MALLOC_NORMAL_ONLY));
aclDataBuffer* output_data_buffer =
aclCreateDataBuffer(buff_dev_ptr, dim.production());
CHECK(output_data_buffer != nullptr);
CHECK(ACL_ERROR_NONE ==
aclmdlAddDatasetBuffer(otensors_, output_data_buffer));
}
return 0;
}
void HWAscendNPURuntime::UnloadModel() {
if (!model_loaded_) {
LOG(ERROR) << "[HWAscendNPU]: No model has been loaded";
return;
}
aclError ret = ACL_ERROR_NONE;
ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Unload a model, error code: " << ret;
return;
}
if (model_ptr_) {
aclrtFree(model_ptr_);
model_ptr_ = nullptr;
}
if (model_weights_ptr_) {
aclrtFree(model_weights_ptr_);
model_weights_ptr_ = nullptr;
}
model_loaded_ = false;
}
void HWAscendNPURuntime::DestroyDesc() {
if (model_desc_) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
}
void HWAscendNPURuntime::DestroyInput() {
if (itensors_ == nullptr) {
return;
}
size_t buf_num = aclmdlGetDatasetNumBuffers(itensors_);
for (size_t i = 0; i < buf_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
aclDestroyDataBuffer(data_buffer);
}
aclmdlDestroyDataset(itensors_);
itensors_ = nullptr;
}
void HWAscendNPURuntime::DestroyOutput() {
if (otensors_ == nullptr) {
return;
}
size_t buf_num = aclmdlGetDatasetNumBuffers(otensors_);
for (size_t i = 0; i < buf_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i);
aclDestroyDataBuffer(data_buffer);
}
aclmdlDestroyDataset(otensors_);
otensors_ = nullptr;
}
int HWAscendNPURuntime::SetInput(const std::vector<Tensor*>& itensors,
const std::vector<DDim>& idims) {
CHECK(itensors.size() == idims.size());
size_t input_tensor_num = itensors.size();
for (size_t i = 0; i < input_tensor_num; ++i) {
CHECK(itensors[i]->memory_size() == idims[i].production());
}
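// Rebuild the input dataset only when the buffer count or any buffer size no
// longer matches the requested dims; otherwise the existing device buffers are
// reused.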
size_t num_buffers_in_dataset = aclmdlGetDatasetNumBuffers(itensors_);
if (num_buffers_in_dataset != input_tensor_num) {
if (0 != CreateInput(idims)) {
return -1;
}
} else {
bool need_to_create_input = false;
for (size_t i = 0; i < num_buffers_in_dataset; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
int64_t buf_size = aclGetDataBufferSize(data_buffer);
if (buf_size != idims[i].production()) {
need_to_create_input = true;
}
}
if (need_to_create_input && 0 != CreateInput(idims)) {
return -1;
}
}
// copy input data from host to device
for (size_t i = 0; i < input_tensor_num; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i);
void* buf_dev_ptr = aclGetDataBufferAddr(data_buffer);
TargetWrapperHWAscendNPU::MemcpySync(buf_dev_ptr,
itensors[i]->raw_data(),
itensors[i]->memory_size(),
IoDirection::HtoD);
}
return 0;
}
void HWAscendNPURuntime::GetOutput(const std::vector<Tensor*>* otensors_ptr) {
CHECK(otensors_ptr != nullptr);
size_t num_output = aclmdlGetDatasetNumBuffers(otensors_);
const std::vector<Tensor*> otensors = *otensors_ptr;
CHECK(num_output == otensors.size());
for (size_t i = 0; i < num_output; ++i) {
aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i);
TargetWrapperHWAscendNPU::MemcpySync(otensors[i]->raw_data(),
aclGetDataBufferAddr(data_buffer),
aclGetDataBufferSize(data_buffer),
IoDirection::DtoH);
}
}
int HWAscendNPURuntime::Process() {
aclError ret = aclmdlExecute(model_id_, itensors_, otensors_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Execute model failed, model_id: " << model_id_
<< ", error code: " << ret;
}
return ret;
}
int HWAscendNPURuntime::GetModelIOTensorDim(std::vector<TensorDesc>* idims,
std::vector<TensorDesc>* odims) {
aclError ret = ACL_ERROR_NONE;
size_t num_inputs = aclmdlGetNumInputs(model_desc_);
size_t num_outputs = aclmdlGetNumOutputs(model_desc_);
for (size_t i = 0; i < num_inputs; ++i) {
aclmdlIODims dims;
ret = aclmdlGetInputDims(model_desc_, i, &dims);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Get input dims failed, index: " << i;
return ret;
}
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
aclFormat format = aclmdlGetInputFormat(model_desc_, i);
idims->push_back(TensorDesc(data_type, dims, format));
}
for (size_t i = 0; i < num_outputs; ++i) {
aclmdlIODims dims;
ret = aclmdlGetOutputDims(model_desc_, i, &dims);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Get output dims failed, index: " << i;
return ret;
}
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
aclFormat format = aclmdlGetOutputFormat(model_desc_, i);
odims->push_back(TensorDesc(data_type, dims, format));
}
return 0;
}
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
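A minimal lifecycle sketch for HWAscendNPURuntime (illustrative only, not part of the commit); SubgraphEngine::LaunchDeviceProgram further below follows the same sequence:
// Illustrative only: `model_buf`/`model_len` come from ge::aclgrphBuildModel;
// `inputs`/`outputs` are lite::Tensor* vectors, `idims`/`odims` their DDims.
paddle::lite::hw_ascend_npu::HWAscendNPURuntime runtime(model_buf, model_len);
CHECK(runtime.model_loaded());
CHECK_EQ(runtime.SetInput(inputs, idims), 0);  // host-to-device copy
CHECK_EQ(runtime.CreateOutput(odims), 0);      // allocate device output buffers
CHECK_EQ(runtime.Process(), 0);                // runs aclmdlExecute
runtime.GetOutput(&outputs);                   // device-to-host copy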
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// hw_ascend_npu runtime library
#include <acl/acl.h>
#include <acl/tensor.h>
#include <memory>
#include <vector>
#include "lite/core/tensor.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace hw_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
tensor_desc_ =
aclCreateTensorDesc(data_type, dims.dimCount, dims.dims, format);
CHECK(tensor_desc_ != nullptr);
aclSetTensorDescName(tensor_desc_, dims.name);
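// For NHWC tensors, remap the logical N/C/H/W accessors below to the physical
// dimension order reported by ACL; NCHW keeps the identity order.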
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
}
~TensorDesc() {
if (tensor_desc_ != nullptr) {
aclDestroyTensorDesc(tensor_desc_);
tensor_desc_ = nullptr;
}
}
uint32_t GetNumber() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[0]));
}
uint32_t GetChannel() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[1]));
}
uint32_t GetHeight() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[2]));
}
uint32_t GetWidth() const {
return static_cast<uint32_t>(
aclGetTensorDescDim(tensor_desc_, dim_order[3]));
}
const aclTensorDesc& GetTensorDesc() const { return *tensor_desc_; }
private:
aclTensorDesc* tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<uint32_t> dim_order{0, 1, 2, 3};
};
class HWAscendNPURuntime {
public:
HWAscendNPURuntime(std::shared_ptr<uint8_t> model_buff_built,
size_t model_buff_size);
~HWAscendNPURuntime();
int SetInput(const std::vector<Tensor*>& itensors,
const std::vector<DDim>& idims);
void GetOutput(const std::vector<Tensor*>* otensors_ptr);
int Process();
bool model_loaded() const { return model_loaded_; }
int CreateInput(const std::vector<DDim>& idims);
int CreateOutput(const std::vector<DDim>& odims);
int GetModelIOTensorDim(std::vector<TensorDesc>* idims,
std::vector<TensorDesc>* odims);
private:
int LoadModelFromMem(std::shared_ptr<uint8_t> model_buff_built,
size_t model_buff_size);
void UnloadModel();
void DestroyDesc();
void DestroyInput();
void DestroyOutput();
private:
aclmdlDataset* itensors_{nullptr};
aclmdlDataset* otensors_{nullptr};
uint32_t model_id_{0};
void* model_ptr_{nullptr};
void* model_weights_ptr_{nullptr};
size_t model_size_{0};
size_t model_weights_size_{0};
bool model_loaded_{false};
aclmdlDesc* model_desc_{nullptr};
};
} // namespace hw_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/hw_ascend_npu/target_wrapper.h"
#include <acl/acl.h>
#include <glog/logging.h>
namespace paddle {
namespace lite {
void* TargetWrapperHWAscendNPU::Malloc(size_t size) {
void* ptr{nullptr};
if (ACL_ERROR_NONE != aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_NORMAL_ONLY)) {
LOG(ERROR) << "[HWAscendNPU]: Allocate memory from device failed";
ptr = nullptr;
}
return ptr;
}
void TargetWrapperHWAscendNPU::Free(void* ptr) { aclrtFree(ptr); }
void TargetWrapperHWAscendNPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
template <>
class TargetWrapper<TARGET(kHWAscendNPU)> {
public:
static size_t num_devices() { return 4; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
using TargetWrapperHWAscendNPU = TargetWrapper<TARGET(kHWAscendNPU)>;
} // namespace lite
} // namespace paddle
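A small host/device round trip through the wrapper (illustrative sketch, not part of this commit):
// Sketch only: copy a host buffer to the device and back via the wrapper.
std::vector<uint8_t> host_src(1024, 0), host_dst(1024);
void* dev = paddle::lite::TargetWrapperHWAscendNPU::Malloc(host_src.size());
paddle::lite::TargetWrapperHWAscendNPU::MemcpySync(
    dev, host_src.data(), host_src.size(), paddle::lite::IoDirection::HtoD);
paddle::lite::TargetWrapperHWAscendNPU::MemcpySync(
    host_dst.data(), dev, host_dst.size(), paddle::lite::IoDirection::DtoH);
paddle::lite::TargetWrapperHWAscendNPU::Free(dev);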
@@ -535,6 +535,7 @@ class ContextScheduler {
.As<HWAscendNPUContext>()
.CopySharedTo(&ctx->As<HWAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_MLU
case TARGET(kMLU): {
int dev_id = TargetWrapper<TargetType::kMLU>::GetCurDevice();
......
@@ -17,7 +17,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
FPGA_DEPS ${fpga_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......
add_subdirectory(bridges)
add_kernel(subgraph_compute_hw_ascend_npu HWAscendNPU basic SRCS subgraph_compute.cc DEPS
${lite_kernel_deps} device_hw_ascend_npu subgraph_bridge_engine ${hw_ascend_npu_subgraph_bridges})
@@ -2,11 +2,15 @@ if (NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
message(STATUS "======compile hw_ascend_npu bridges, ${ascend_builder_libs}")
message(STATUS "======compile hw_ascend_npu bridges, ${hw_ascend_npu_builder_libs}")
lite_cc_library(subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc
DEPS ${hw_ascend_npu_builder_libs} tensor)
lite_cc_library(subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc
DEPS ${ascend_builder_libs})
DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
template <typename ActType>
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ActType>(out_name);
auto act_op = act_node->template data<ActType>();
act_op->set_input_x(*x_node->data());
return SUCCESS;
}
template <>
int ActConverter<ge::op::Activation>(void* ctx,
OpLite* op,
KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc.
act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
act_op->set_attr_coef(Relu_clipped_coef);
} else if (op_type == "relu6") {
float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef);
} else if (op_type == "leaky_relu") {
auto alpha = op_info->GetAttr<float>("alpha");
act_op->set_attr_negative_slope(alpha);
} else if (op_type == "hard_sigmoid") {
auto slope = op_info->GetAttr<float>("slope");
auto offset = op_info->GetAttr<float>("offset");
act_op->set_attr_negative_slope(slope);
act_op->set_attr_coef(offset);
}
return SUCCESS;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
sigmoid,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
tanh,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu_clipped,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
relu6,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
leaky_relu,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
abs,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softsign,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
softplus,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
hard_sigmoid,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Activation>);
REGISTER_SUBGRAPH_BRIDGE(
log,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Log>);
REGISTER_SUBGRAPH_BRIDGE(
square,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Square>);
REGISTER_SUBGRAPH_BRIDGE(
sqrt,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Sqrt>);
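A bridge for another unary op would follow the same registration pattern; a hypothetical example (not part of this commit, and it assumes a ge::op::Exp operator with set_input_x exists in all_ops.h):
// Hypothetical registration, shown only to illustrate the pattern.
REGISTER_SUBGRAPH_BRIDGE(
exp,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ActConverter<ge::op::Exp>);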
@@ -14,8 +14,8 @@
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
/// reference from opp package
#include <all_ops.h>
#include <utility>
#include "lite/kernels/hw_ascend_npu/utility.h"
namespace paddle {
namespace lite {
@@ -49,8 +49,9 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType precision = tensor.precision();
if (tensor.persistable()) {
// Const node
node = Add<ge::Const>(name, precision, layout);
node->data<ge::Const>()->set_attr_value(CvtTensor(tensor, shape, layout));
node = Add<ge::op::Const>(name, precision, layout);
node->data<ge::op::Const>()->set_attr_value(
CvtTensor(tensor, shape, layout));
} else {
// Data node
node = Add(name, shape, precision, layout);
@@ -63,10 +64,10 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout) {
auto node = Add<ge::Data>(name, precision, layout);
auto node = Add<ge::op::Data>(name, precision, layout);
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
node->data<ge::Data>()->update_input_desc_x(desc);
node->data<ge::op::Data>()->update_input_desc_data(desc);
return node;
}
......
@@ -14,13 +14,15 @@
#pragma once
// reference from atc package
#include <all_ops.h>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
// reference from atc package
#include "graph/operators.h"
#include "graph/operator.h"
#include "graph/operator_reg.h"
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/utility.h"
#include <algorithm>
#include <utility>
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
ge::DataType CvtPrecisionType(PrecisionType itype) {
ge::DataType otype = ge::DT_FLOAT;
switch (itype) {
case PRECISION(kFloat):
otype = ge::DT_FLOAT;
break;
case PRECISION(kInt8):
otype = ge::DT_INT8;
break;
case PRECISION(kInt32):
otype = ge::DT_INT32;
break;
case PRECISION(kFP16):
otype = ge::DT_FLOAT16;
break;
case PRECISION(kBool):
otype = ge::DT_BOOL;
break;
case PRECISION(kInt64):
otype = ge::DT_INT64;
break;
case PRECISION(kInt16):
otype = ge::DT_INT16;
break;
default:
LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert precision type("
<< PrecisionToStr(itype) << ") from Lite to HW_ASCEND_NPU";
break;
}
return otype;
}
ge::Format CvtDataLayoutType(DataLayoutType itype) {
ge::Format otype = ge::FORMAT_NCHW;
switch (itype) {
case DATALAYOUT(kNCHW):
otype = ge::FORMAT_NCHW;
break;
case DATALAYOUT(kNHWC):
otype = ge::FORMAT_NHWC;
break;
// TODO(yanghongtian): support more data layout type
default:
LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert data layout type("
<< DataLayoutToStr(itype) << ") from Lite to HW_ASCEND_NPU";
break;
}
return otype;
}
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape) {
CHECK(in_shape.size() <= 4 && in_shape.size() > 0)
<< "[HW_ASCEND_NPU] The size of in_shape is invalid: " << in_shape.size();
// Padding the shape to 4-dimensions(NCHW)
std::vector<int64_t> out_shape(4, 1);
std::copy(in_shape.begin(),
in_shape.end(),
out_shape.begin() + 4 - in_shape.size());
return out_shape;
}
std::vector<int64_t> CvtShape(const DDim& in_dims) {
return CvtShape(in_dims.Vectorize());
}
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape,
DataLayoutType in_layout) {
PrecisionType in_precision = in_tensor.precision();
auto in_size = in_tensor.dims().production();
auto in_shape = in_tensor.dims().Vectorize();
if (out_shape.empty()) {
out_shape = in_shape;
}
ge::TensorDesc out_desc(ge::Shape(out_shape),
CvtDataLayoutType(in_layout),
CvtPrecisionType(in_precision));
auto out_size = out_desc.GetShape().GetShapeSize();
CHECK_EQ(out_size, in_size);
ge::Tensor out_tensor;
out_tensor.SetTensorDesc(out_desc);
out_tensor.SetData(reinterpret_cast<const uint8_t*>(in_tensor.raw_data()),
in_tensor.memory_size());
return std::move(out_tensor);
}
int CvtActMode(const std::string& act_type) {
// based on the nonlinear_fuc_ops.h in OPP (line 210)
// default to Relu
int act_mode = 1;
if (act_type == "sigmoid") {
act_mode = 0;
} else if (act_type == "relu") {
act_mode = 1;
} else if (act_type == "tanh") {
act_mode = 2;
} else if (act_type == "relu_clipped" || act_type == "relu6") {
act_mode = 3;
} else if (act_type == "elu") {
act_mode = 4;
} else if (act_type == "leaky_relu") {
act_mode = 5;
} else if (act_type == "abs") {
act_mode = 6;
} else if (act_type == "relu1") {
// TODO(yanghongtian): check hw_ascend_npu supports relu1 or not.
act_mode = 7;
} else if (act_type == "softsign") {
act_mode = 8;
} else if (act_type == "softplus") {
act_mode = 9;
} else {
// TODO(yanghongtian): support more activation mode
LOG(FATAL) << "[NPU] Unsupported activation type " << act_type;
}
return act_mode;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
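A small worked example of the converters above (illustrative only):
// Illustrative only: CvtShape left-pads shapes with 1s up to 4-D NCHW.
auto s = paddle::lite::subgraph::hw_ascend_npu::CvtShape(
    std::vector<int64_t>({3, 224, 224}));
// s == {1, 3, 224, 224}
// CvtPrecisionType and CvtActMode follow the mappings above, e.g.
//   CvtPrecisionType(PRECISION(kFloat)) -> ge::DT_FLOAT
//   CvtActMode("relu6")                 -> 3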
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <graph/tensor.h>
#include <graph/types.h>
#include <functional>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Type/tensor converters for converting Paddle type/tensor to hw ascend npu IR
// type
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
// Padding the shape to 4-dimensions(NCHW) for HW_ASCEND_NPU
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape);
std::vector<int64_t> CvtShape(const DDim& in_dims);
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(const std::string& act_type);
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/hw_ascend_npu/subgraph_compute.h"
#include <sys/time.h>
#include <time.h>
#include <utility>
#include "lite/backends/hw_ascend_npu/device.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/paddle_use_bridges.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace hw_ascend_npu {
int SubgraphEngine::BuildDeviceProgram() {
int status = 0;
// Convert all of the ops, along with their input vars and weights, and add
// them to the HWAscendNPU IR graph
subgraph::hw_ascend_npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
op->InferShape();
std::string op_type = op->op_info()->Type();
if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) {
return subgraph::FAILED;
}
auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kHWAscendNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) {
return subgraph::FAILED;
}
}
// Collect the valid input and output nodes in the HWAscendNPU IR graph and update
// the input and output names
device_inames_.clear();
device_onames_.clear();
std::vector<ge::Operator> device_inodes;
std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) {
if (graph.Has(input_name)) {
if (graph.Get(input_name)->is_data()) {
device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name);
} else {
LOG(WARNING) << "[HWAscendNPU] Input node " << input_name
<< " is ignored because it is not a data node.";
}
} else {
LOG(WARNING) << "[HWAscendNPU] Input node " << input_name
<< " is ignored because it does not exist.";
}
}
for (auto& output_name : output_names_) {
if (graph.Has(output_name)) {
device_onodes.push_back(*graph.Get(output_name)->data());
device_onames_.push_back(output_name);
} else {
LOG(WARNING) << "[HWAscendNPU] Output node " << output_name
<< " is ignored because it does not exist.";
}
}
CHECK(!device_inames_.empty())
<< "[HWAscendNPU] No input nodes found for building NPU model";
CHECK(!device_onames_.empty())
<< "[HWAscendNPU] No output nodes found for building NPU model";
// Build the IR graph to om model as the device program
if (device_program_map_.count(inputs_shape_) > 0) {
return status;
}
auto device_client =
lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes);
if (device_client == nullptr) {
LOG(WARNING) << "[HWAscendNPU] Build model failed!";
return subgraph::FAILED;
}
auto device_program = std::make_shared<device_program_t>(device_client);
device_program_map_[inputs_shape_] = device_program;
// Query and check the dimensions of valid input and output tensors
std::vector<TensorDesc> device_idims, device_odims;
if (device_program->client->GetModelIOTensorDim(&device_idims,
&device_odims) != 0) {
LOG(WARNING) << "[HWAscendNPU] Get the dimensions of input and output "
"tensors failed!";
return subgraph::FAILED;
}
device_program->device_idims = device_idims;
device_program->device_odims = device_odims;
CHECK_EQ(device_idims.size(), device_inames_.size());
CHECK_EQ(device_odims.size(), device_onames_.size());
origin_idims_.resize(device_inames_.size());
origin_itensors_.resize(device_inames_.size());
origin_odims_.resize(device_onames_.size());
origin_otensors_.resize(device_onames_.size());
for (size_t i = 0; i < device_inames_.size(); i++) {
auto node = graph.Get(device_inames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "[HWAscendNPU] Inputs[" << i << "] name: " << device_inames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout) << " dims: {"
<< device_idims[i].GetNumber() << ","
<< device_idims[i].GetChannel() << ","
<< device_idims[i].GetHeight() << "," << device_idims[i].GetWidth()
<< "}";
// Prepare the device input tensors
CHECK_EQ(origin_idims_[i].production(),
device_idims[i].GetNumber() * device_idims[i].GetChannel() *
device_idims[i].GetHeight() * device_idims[i].GetWidth());
}
device_program->origin_idims = origin_idims_;
for (size_t i = 0; i < device_onames_.size(); i++) {
auto node = graph.Get(device_onames_[i]);
auto precision = node->precision();
auto layout = node->layout();
origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "[HWAscendNPU] Outputs[" << i << "] name: " << device_onames_[i]
<< " precision: " << PrecisionToStr(precision)
<< " layout: " << DataLayoutToStr(layout) << " dims: {"
<< device_odims[i].GetNumber() << ","
<< device_odims[i].GetChannel() << ","
<< device_odims[i].GetHeight() << "," << device_odims[i].GetWidth()
<< "}";
// Prepare the device output tensors
switch (precision) {
case PRECISION(kFloat):
origin_otensors_[i]->mutable_data<float>();
break;
case PRECISION(kBool):
origin_otensors_[i]->mutable_data<bool>();
break;
case PRECISION(kInt8):
origin_otensors_[i]->mutable_data<int8_t>();
break;
case PRECISION(kInt16):
origin_otensors_[i]->mutable_data<int16_t>();
break;
case PRECISION(kInt32):
origin_otensors_[i]->mutable_data<int32_t>();
break;
case PRECISION(kInt64):
origin_otensors_[i]->mutable_data<int64_t>();
break;
default:
LOG(FATAL) << "[HWAscendNPU] " << device_onames_[i]
<< " can't mutable data with precision type "
<< PrecisionToStr(precision);
break;
}
device_program->origin_odims = origin_odims_;
CHECK_EQ(origin_odims_[i].production(),
device_odims[i].GetNumber() * device_odims[i].GetChannel() *
device_odims[i].GetHeight() * device_odims[i].GetWidth());
}
return status;
}
int SubgraphEngine::LaunchDeviceProgram() {
// Copy the data of origin input tensors to the buffer of input HWAscendNPU
// tensors
auto device_program = device_program_map_[inputs_shape_];
int ret = 0;
ret = device_program->client->SetInput(origin_itensors_,
device_program->origin_idims);
if (ret != 0) {
return ret;
}
device_program->client->CreateOutput(device_program->origin_odims);
// run inference
auto GetCurrentUS = []() -> double {
struct timeval time;
gettimeofday(&time, NULL);
return 1e+6 * time.tv_sec + time.tv_usec;
};
auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process(), 0);
VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
<< " us";
device_program->client->GetOutput(&origin_otensors_);
return 0;
}
bool SubgraphEngine::InputShapeChanged() {
std::vector<std::vector<int64_t>> new_shape;
for (auto origin_itensor : origin_itensors_) {
new_shape.push_back(origin_itensor->dims().Vectorize());
}
inputs_shape_ = new_shape;
if (device_program_map_.count(inputs_shape_) > 0) {
return false;
}
return true;
}
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.input_data_names,
param.output_data_names,
param.scope));
CHECK(engine_);
engine_->Build();
}
void SubgraphCompute::Run() {
CHECK(engine_);
engine_->Launch();
}
} // namespace hw_ascend_npu
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(subgraph,
kHWAscendNPU,
kAny,
kNCHW,
paddle::lite::kernels::hw_ascend_npu::SubgraphCompute,
def)
.BindInput("Inputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.BindOutput("Outputs",
{LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
.Finalize();
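To route supported ops to this kernel at runtime, the kHWAscendNPU place has to be listed among the valid places of the full API; a hedged sketch, assuming the new target enum is wired up elsewhere in this patch:
// Sketch only: follows the existing CxxConfig usage; `model_dir` is a placeholder.
paddle::lite_api::CxxConfig config;
config.set_model_dir(model_dir);
config.set_valid_places({
    paddle::lite_api::Place{TARGET(kHWAscendNPU), PRECISION(kFloat)},
    paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)},
});
auto predictor =
    paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(config);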
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <graph/tensor.h>
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
using HWAscendNPURuntime = paddle::lite::hw_ascend_npu::HWAscendNPURuntime;
using TensorDesc = paddle::lite::hw_ascend_npu::TensorDesc;
namespace paddle {
namespace lite {
namespace kernels {
namespace hw_ascend_npu {
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
struct device_program_t {
explicit device_program_t(std::shared_ptr<HWAscendNPURuntime> _client)
: client(_client) {}
std::shared_ptr<HWAscendNPURuntime> client{nullptr};
std::vector<DDim> origin_idims{};
std::vector<DDim> origin_odims{};
std::vector<TensorDesc> device_idims{};
std::vector<TensorDesc> device_odims{};
};
protected:
int BuildDeviceProgram() override;
int LaunchDeviceProgram() override;
bool InputShapeChanged() override;
std::vector<std::vector<int64_t>> inputs_shape_{};
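// One compiled om model is cached per distinct combination of input shapes,
// so a shape change triggers a rebuild only the first time it is seen.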
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<device_program_t>>
device_program_map_{};
std::vector<std::string> device_inames_{};
std::vector<std::string> device_onames_{};
};
class SubgraphCompute
: public KernelLite<TARGET(kHWAscendNPU), PRECISION(kAny)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace hw_ascend_npu
} // namespace kernels
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU)
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU)
return()
endif()
......