From 8278f1146c8f5b94e014503825386db8122d19f4 Mon Sep 17 00:00:00 2001 From: yanghongtian Date: Wed, 15 Apr 2020 19:01:08 +0800 Subject: [PATCH] add subgraph compute and backends --- cmake/lite.cmake | 20 +- lite/api/CMakeLists.txt | 13 +- lite/backends/CMakeLists.txt | 1 + lite/backends/hw_ascend_npu/CMakeLists.txt | 14 + lite/backends/hw_ascend_npu/device.cc | 58 ++++ lite/backends/hw_ascend_npu/device.h | 56 ++++ lite/backends/hw_ascend_npu/runtime.cc | 296 ++++++++++++++++++ lite/backends/hw_ascend_npu/runtime.h | 109 +++++++ lite/backends/hw_ascend_npu/target_wrapper.cc | 50 +++ lite/backends/hw_ascend_npu/target_wrapper.h | 38 +++ lite/core/context.h | 1 + lite/gen_code/CMakeLists.txt | 2 +- lite/kernels/hw_ascend_npu/CMakeLists.txt | 3 + .../hw_ascend_npu/bridges/CMakeLists.txt | 8 +- lite/kernels/hw_ascend_npu/bridges/act_op.cc | 164 ++++++++++ lite/kernels/hw_ascend_npu/bridges/graph.cc | 11 +- lite/kernels/hw_ascend_npu/bridges/graph.h | 6 +- lite/kernels/hw_ascend_npu/bridges/utility.cc | 163 ++++++++++ lite/kernels/hw_ascend_npu/bridges/utility.h | 54 ++++ .../kernels/hw_ascend_npu/subgraph_compute.cc | 260 +++++++++++++++ lite/kernels/hw_ascend_npu/subgraph_compute.h | 84 +++++ lite/kernels/npu/bridges/CMakeLists.txt | 2 +- 22 files changed, 1385 insertions(+), 28 deletions(-) create mode 100644 lite/backends/hw_ascend_npu/CMakeLists.txt create mode 100644 lite/backends/hw_ascend_npu/device.cc create mode 100644 lite/backends/hw_ascend_npu/device.h create mode 100644 lite/backends/hw_ascend_npu/runtime.cc create mode 100644 lite/backends/hw_ascend_npu/runtime.h create mode 100644 lite/backends/hw_ascend_npu/target_wrapper.cc create mode 100644 lite/backends/hw_ascend_npu/target_wrapper.h create mode 100644 lite/kernels/hw_ascend_npu/bridges/act_op.cc create mode 100644 lite/kernels/hw_ascend_npu/bridges/utility.cc create mode 100644 lite/kernels/hw_ascend_npu/bridges/utility.h create mode 100644 lite/kernels/hw_ascend_npu/subgraph_compute.cc create mode 100644 lite/kernels/hw_ascend_npu/subgraph_compute.h diff --git a/cmake/lite.cmake b/cmake/lite.cmake index 11cc446d44..dc9d1fb9fc 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -153,12 +153,12 @@ function(lite_cc_library TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} - MLU_DEPS ${args_MLU_DEPS} - ) + # MLU_DEPS ${args_MLU_DEPS} + HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} + ) if (args_SHARED OR ARGS_shared) cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} SHARED) @@ -204,7 +204,7 @@ function(lite_cc_binary TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} - MLU_DEPS ${args_MLU_DEPS} + # MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) @@ -261,7 +261,7 @@ function(lite_cc_test TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} - MLU_DEPS ${args_MLU_DEPS} + # MLU_DEPS ${args_MLU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -443,8 +443,8 @@ function(add_kernel TARGET device level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} - BM_DEPS ${args_BM_DEPS} - MLU_DEPS ${args_MLU_DEPS} + BM_DEPS ${args_BM_DEPS} + #MLU_DEPS ${args_MLU_DEPS} 
PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -463,7 +463,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -498,8 +498,8 @@ function(add_operator TARGET level) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} - BM_DEPS ${args_BM_DEPS} - MLU_DEPS ${args_MLU_DEPS} + BM_DEPS ${args_BM_DEPS} + #MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 9f1803c9ad..579b8a3ca4 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -68,7 +68,7 @@ if (WITH_TESTING) X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} BM_DEPS ${bm_kernels} - HW_ASCENND_NPU_DEPS ${hw_ascend_npu_kernels} + HWAscendNPU_DEPS ${hw_ascend_npu_kernels} MLU_DEPS ${mlu_kernels}) endif() if(LITE_WITH_FPGA) @@ -110,7 +110,7 @@ if (NOT LITE_ON_TINY_PUBLISH) BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} - HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}) + HWAscendNPU_DEPS ${hw_ascend_npu_kernels}) endif() # for light api @@ -132,8 +132,7 @@ lite_cc_library(light_api SRCS light_api.cc CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} - MLU_DEPS ${mlu_kernels}) + HWAscendNPU_DEPS ${hw_ascend_npu_kernels}) include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING @@ -152,8 +151,7 @@ if(WITH_TESTING) CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} - MLU_DEPS ${mlu_kernels} + HWAscendNPU_DEPS ${hw_ascend_npu_kernels} EXCLUDE_COMPILE_DEPS "ON" ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -301,6 +299,7 @@ lite_cc_test(test_apis SRCS apis_test.cc FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} + HWAscendNPU_DEPS ${hw_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -338,8 +337,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} - HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} MLU_DEPS ${mlu_kernels} + HWAscendNPU_DEPS ${hw_ascend_npu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL) if (WITH_TESTING) add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz) diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index fb459ae362..912c9beb79 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(hw_ascend_npu) diff --git a/lite/backends/hw_ascend_npu/CMakeLists.txt b/lite/backends/hw_ascend_npu/CMakeLists.txt new file mode 100644 index 0000000000..a412e22884 --- /dev/null +++ 
b/lite/backends/hw_ascend_npu/CMakeLists.txt
@@ -0,0 +1,14 @@
+if(NOT LITE_WITH_HW_ASCEND_NPU)
+  return()
+endif()
+lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
+                ${hw_ascend_npu_builder_libs}
+                ${hw_ascend_npu_runtime_libs})
+lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
+                ${hw_ascend_npu_builder_libs}
+                ${hw_ascend_npu_runtime_libs})
+lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
+                ${hw_ascend_npu_builder_libs}
+                ${hw_ascend_npu_runtime_libs}
+                target_wrapper_hw_ascend_npu
+                runtime_hw_ascend_npu)
diff --git a/lite/backends/hw_ascend_npu/device.cc b/lite/backends/hw_ascend_npu/device.cc
new file mode 100644
index 0000000000..eb2ca933ec
--- /dev/null
+++ b/lite/backends/hw_ascend_npu/device.cc
@@ -0,0 +1,58 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/hw_ascend_npu/device.h"
+#include <map>
+#include <string>
+#include "ge/ge_api_types.h"
+#include "lite/backends/hw_ascend_npu/runtime.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace hw_ascend_npu {
+std::shared_ptr<HWAscendNPURuntime> Device::Build(
+    std::vector<ge::Operator>& input_nodes,   // NOLINT
+    std::vector<ge::Operator>& output_nodes   // NOLINT
+    ) {
+  VLOG(3) << "[HWAscendNPU] Build model";
+  // Build the IR graph to the om model
+  ge::Graph ir_graph("graph");
+  ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
+  ge::ModelBufferData model;
+
+  std::map<std::string, std::string> build_options;
+  build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
+
+  ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
+
+  if (ret != ge::GRAPH_SUCCESS) {
+    LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
+    return nullptr;
+  }
+
+  std::shared_ptr<HWAscendNPURuntime> model_runtime(
+      new HWAscendNPURuntime(model.data, model.length));
+  CHECK(model_runtime != nullptr);
+  if (!model_runtime->model_loaded()) {
+    LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
+    return nullptr;
+  }
+  VLOG(3) << "[HWAscendNPU]: Build done";
+  return model_runtime;
+}
+
+}  // namespace hw_ascend_npu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/hw_ascend_npu/device.h b/lite/backends/hw_ascend_npu/device.h
new file mode 100644
index 0000000000..ee820ead2b
--- /dev/null
+++ b/lite/backends/hw_ascend_npu/device.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +#include +#include +#include +#include +#include "ge/ge_ir_build.h" // NOLINT +#include "lite/backends/hw_ascend_npu/runtime.h" +namespace paddle { +namespace lite { +namespace hw_ascend_npu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + int freq_level() { return freq_level_; } + int framework_type() { return framework_type_; } + int model_type() { return model_type_; } + int device_type() { return device_type_; } + + // Build the IR graph to om model, return a HWAscendNPURuntime instance to + // load om model and run inference. + std::shared_ptr Build( + std::vector& input_nodes, // NOLINT + std::vector& output_nodes // NOLINT + ); // NOLINT + + private: + int freq_level_{3}; + int framework_type_{0}; + int model_type_{0}; + int device_type_{0}; +}; + +} // namespace hw_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/hw_ascend_npu/runtime.cc b/lite/backends/hw_ascend_npu/runtime.cc new file mode 100644 index 0000000000..d6f39c8ce7 --- /dev/null +++ b/lite/backends/hw_ascend_npu/runtime.cc @@ -0,0 +1,296 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/hw_ascend_npu/runtime.h" +#include "lite/backends/hw_ascend_npu/target_wrapper.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace hw_ascend_npu { +HWAscendNPURuntime::HWAscendNPURuntime( + std::shared_ptr model_buff_built, size_t model_buff_size) { + model_loaded_ = (0 == LoadModelFromMem(model_buff_built, model_buff_size)); +} + +HWAscendNPURuntime::~HWAscendNPURuntime() { + UnloadModel(); + DestroyDesc(); + DestroyInput(); + DestroyOutput(); +} + +int HWAscendNPURuntime::LoadModelFromMem( + std::shared_ptr model_buff_built, size_t model_buff_size) { + if (model_loaded_) { + LOG(ERROR) << "[HWAscendNPU]: Has already loaded a model"; + return 0; + } + aclError ret = aclmdlQuerySizeFromMem(model_buff_built.get(), + model_buff_size, + &model_size_, + &model_weights_size_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, " + "error code: " + << ret; + return ret; + } + + ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, " + "error code: " + << ret; + return ret; + } + + ret = aclrtMalloc( + &model_weights_ptr_, model_weights_size_, ACL_MEM_MALLOC_NORMAL_ONLY); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model " + "weights, error code: " + << ret; + return ret; + } + + ret = aclmdlLoadFromMemWithMem(model_buff_built.get(), + model_buff_size, + &model_id_, + model_ptr_, + model_size_, + model_weights_ptr_, + model_weights_size_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Can not Load model from memory, error code: " + << ret; + 
return ret; + } + + model_desc_ = aclmdlCreateDesc(); + if (model_desc_ == nullptr) { + LOG(ERROR) << "HWAscendNPU]: Can not create model descriptor."; + return ACL_ERROR_FAILURE; + } + + ret = aclmdlGetDesc(model_desc_, model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Can not get model descriptor from model, " + "error code: " + << ret; + return ret; + } + return ret; +} + +int HWAscendNPURuntime::CreateInput(const std::vector& idims) { + if (itensors_ != nullptr) { + DestroyInput(); + } + itensors_ = aclmdlCreateDataset(); + if (itensors_ == nullptr) { + LOG(ERROR) << "[HWAscendNPU]: Can not create input dataset"; + return ACL_ERROR_FAILURE; + } + + for (auto& dim : idims) { + void* buff_dev_ptr = nullptr; + CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr, + dim.production(), + ACL_MEM_MALLOC_NORMAL_ONLY)); + aclDataBuffer* input_data_buffer = + aclCreateDataBuffer(buff_dev_ptr, dim.production()); + CHECK(input_data_buffer != nullptr); + CHECK(ACL_ERROR_NONE == + aclmdlAddDatasetBuffer(itensors_, input_data_buffer)); + } + return 0; +} + +int HWAscendNPURuntime::CreateOutput(const std::vector& odims) { + if (otensors_ != nullptr) { + DestroyOutput(); + } + otensors_ = aclmdlCreateDataset(); + if (otensors_ == nullptr) { + LOG(ERROR) << "[HWAscendNPU]: Can not create output dataset"; + return ACL_ERROR_FAILURE; + } + + for (auto& dim : odims) { + void* buff_dev_ptr = nullptr; + CHECK(ACL_ERROR_NONE == aclrtMalloc(&buff_dev_ptr, + dim.production(), + ACL_MEM_MALLOC_NORMAL_ONLY)); + aclDataBuffer* output_data_buffer = + aclCreateDataBuffer(buff_dev_ptr, dim.production()); + CHECK(output_data_buffer != nullptr); + CHECK(ACL_ERROR_NONE == + aclmdlAddDatasetBuffer(otensors_, output_data_buffer)); + } + return 0; +} + +void HWAscendNPURuntime::UnloadModel() { + if (!model_loaded_) { + LOG(ERROR) << "[HWAscendNPU]: No model has been loaded"; + return; + } + aclError ret = ACL_ERROR_NONE; + ret = aclmdlUnload(model_id_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Unload a model, error code: " << ret; + return; + } + + if (model_ptr_) { + aclrtFree(model_ptr_); + model_ptr_ = nullptr; + } + + if (model_weights_ptr_) { + aclrtFree(model_weights_ptr_); + model_weights_ptr_ = nullptr; + } + model_loaded_ = false; +} + +void HWAscendNPURuntime::DestroyDesc() { + if (model_desc_) { + (void)aclmdlDestroyDesc(model_desc_); + model_desc_ = nullptr; + } +} + +void HWAscendNPURuntime::DestroyInput() { + if (itensors_ == nullptr) { + return; + } + size_t buf_num = aclmdlGetDatasetNumBuffers(itensors_); + for (size_t i = 0; i < buf_num; ++i) { + aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i); + aclDestroyDataBuffer(data_buffer); + } + aclmdlDestroyDataset(itensors_); + itensors_ = nullptr; +} + +void HWAscendNPURuntime::DestroyOutput() { + if (otensors_ == nullptr) { + return; + } + size_t buf_num = aclmdlGetDatasetNumBuffers(otensors_); + for (size_t i = 0; i < buf_num; ++i) { + aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i); + aclDestroyDataBuffer(data_buffer); + } + aclmdlDestroyDataset(otensors_); + otensors_ = nullptr; +} + +int HWAscendNPURuntime::SetInput(const std::vector& itensors, + const std::vector& idims) { + CHECK(itensors.size() == idims.size()); + size_t input_tensor_num = itensors.size(); + for (size_t i = 0; i < input_tensor_num; ++i) { + CHECK(itensors[i]->memory_size() == idims[i].production()); + } + size_t num_buffers_in_dataset = aclmdlGetDatasetNumBuffers(itensors_); + if (num_buffers_in_dataset != 
input_tensor_num) { + if (0 != CreateInput(idims)) { + return -1; + } + } else { + bool need_to_create_input = false; + for (size_t i = 0; i < num_buffers_in_dataset; ++i) { + aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i); + int64_t buf_size = aclGetDataBufferSize(data_buffer); + if (buf_size != idims[i].production()) { + need_to_create_input = true; + } + } + if (need_to_create_input && 0 != CreateInput(idims)) { + return -1; + } + } + + // copy input data from host to device + for (size_t i = 0; i < input_tensor_num; ++i) { + aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(itensors_, i); + void* buf_dev_ptr = aclGetDataBufferAddr(data_buffer); + TargetWrapperHWAscendNPU::MemcpySync(buf_dev_ptr, + itensors[i]->raw_data(), + itensors[i]->memory_size(), + IoDirection::HtoD); + } + return 0; +} + +void HWAscendNPURuntime::GetOutput(const std::vector* otensors_ptr) { + CHECK(otensors_ptr != nullptr); + size_t num_output = aclmdlGetDatasetNumBuffers(otensors_); + const std::vector otensors = *otensors_ptr; + + CHECK(num_output == otensors.size()); + for (size_t i = 0; i < num_output; ++i) { + aclDataBuffer* data_buffer = aclmdlGetDatasetBuffer(otensors_, i); + TargetWrapperHWAscendNPU::MemcpySync(otensors[i]->raw_data(), + aclGetDataBufferAddr(data_buffer), + aclGetDataBufferSize(data_buffer), + IoDirection::DtoH); + } +} + +int HWAscendNPURuntime::Process() { + aclError ret = aclmdlExecute(model_id_, itensors_, otensors_); + if (ret != ACL_ERROR_NONE) { + LOG(ERROR) << "[HWAscendNPU]: Execute model failed, model_id: " << model_id_ + << ", error code: " << ret; + } + return ret; +} + +int HWAscendNPURuntime::GetModelIOTensorDim(std::vector* idims, + std::vector* odims) { + aclError ret = ACL_ERROR_NONE; + size_t num_inputs = aclmdlGetNumInputs(model_desc_); + size_t num_outputs = aclmdlGetNumOutputs(model_desc_); + for (size_t i = 0; i < num_inputs; ++i) { + aclmdlIODims dims; + if (ret != aclmdlGetInputDims(model_desc_, i, &dims)) { + LOG(ERROR) << "[HWAscendNPU]: Get input dims failed, index: " << i; + return ret; + } + aclDataType data_type = aclmdlGetInputDataType(model_desc_, i); + aclFormat format = aclmdlGetInputFormat(model_desc_, i); + + idims->push_back(TensorDesc(data_type, dims, format)); + } + + for (size_t i = 0; i < num_outputs; ++i) { + aclmdlIODims dims; + if (ret != aclmdlGetOutputDims(model_desc_, i, &dims)) { + LOG(ERROR) << "[HWAscendNPU]: Get output dims failed, index: " << i; + return ret; + } + aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i); + aclFormat format = aclmdlGetOutputFormat(model_desc_, i); + + odims->push_back(TensorDesc(data_type, dims, format)); + } + return 0; +} +} // namespace hw_ascend_npu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/hw_ascend_npu/runtime.h b/lite/backends/hw_ascend_npu/runtime.h new file mode 100644 index 0000000000..190bbff93a --- /dev/null +++ b/lite/backends/hw_ascend_npu/runtime.h @@ -0,0 +1,109 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// hw_ascend_npu runtime library
+#include <acl/acl.h>
+#include <acl/acl_mdl.h>
+#include <memory>
+#include <vector>
+#include "lite/core/tensor.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace hw_ascend_npu {
+class TensorDesc {
+ public:
+  TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
+    tensor_desc_ =
+        aclCreateTensorDesc(data_type, dims.dimCount, dims.dims, format);
+    CHECK(tensor_desc_ != nullptr);
+    aclSetTensorDescName(tensor_desc_, dims.name);
+    if (format == ACL_FORMAT_NHWC) {
+      dim_order[1] = 3;
+      dim_order[2] = 1;
+      dim_order[3] = 2;
+    }
+  }
+  ~TensorDesc() {
+    if (tensor_desc_ != nullptr) {
+      aclDestroyTensorDesc(tensor_desc_);
+      tensor_desc_ = nullptr;
+    }
+  }
+  uint32_t GetNumber() const {
+    return static_cast<uint32_t>(
+        aclGetTensorDescDim(tensor_desc_, dim_order[0]));
+  }
+  uint32_t GetChannel() const {
+    return static_cast<uint32_t>(
+        aclGetTensorDescDim(tensor_desc_, dim_order[1]));
+  }
+  uint32_t GetHeight() const {
+    return static_cast<uint32_t>(
+        aclGetTensorDescDim(tensor_desc_, dim_order[2]));
+  }
+  uint32_t GetWidth() const {
+    return static_cast<uint32_t>(
+        aclGetTensorDescDim(tensor_desc_, dim_order[3]));
+  }
+  const aclTensorDesc& GetTensorDesc() const { return *tensor_desc_; }
+
+ private:
+  aclTensorDesc* tensor_desc_{nullptr};
+  // n c h w order, default to ACL_FORMAT_NCHW
+  std::vector<size_t> dim_order{0, 1, 2, 3};
+};
+
+class HWAscendNPURuntime {
+ public:
+  HWAscendNPURuntime(std::shared_ptr<uint8_t> model_buff_built,
+                     size_t model_buff_size);
+  ~HWAscendNPURuntime();
+
+  int SetInput(const std::vector<Tensor*>& itensors,
+               const std::vector<DDim>& idims);
+  void GetOutput(const std::vector<Tensor*>* otensors_ptr);
+  int Process();
+  bool model_loaded() const { return model_loaded_; }
+  int CreateInput(const std::vector<DDim>& idims);
+  int CreateOutput(const std::vector<DDim>& odims);
+  int GetModelIOTensorDim(std::vector<TensorDesc>* idims,
+                          std::vector<TensorDesc>* odims);
+
+ private:
+  int LoadModelFromMem(std::shared_ptr<uint8_t> model_buff_built,
+                       size_t model_buff_size);
+
+  void UnloadModel();
+  void DestroyDesc();
+  void DestroyInput();
+  void DestroyOutput();
+
+ private:
+  aclmdlDataset* itensors_{nullptr};
+  aclmdlDataset* otensors_{nullptr};
+  uint32_t model_id_{0};
+  void* model_ptr_{nullptr};
+  void* model_weights_ptr_{nullptr};
+  size_t model_size_{0};
+  size_t model_weights_size_{0};
+  bool model_loaded_{false};
+  aclmdlDesc* model_desc_{nullptr};
+};
+}  // namespace hw_ascend_npu
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/hw_ascend_npu/target_wrapper.cc b/lite/backends/hw_ascend_npu/target_wrapper.cc
new file mode 100644
index 0000000000..0f07725676
--- /dev/null
+++ b/lite/backends/hw_ascend_npu/target_wrapper.cc
@@ -0,0 +1,50 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/hw_ascend_npu/target_wrapper.h"
+#include <acl/acl.h>
+#include <acl/acl_rt.h>
+
+namespace paddle {
+namespace lite {
+
+void* TargetWrapperHWAscendNPU::Malloc(size_t size) {
+  void* ptr{nullptr};
+  if (ACL_ERROR_NONE != aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_NORMAL_ONLY)) {
+    LOG(ERROR) << "[HWAscendNPU]: Allocate memory from device failed";
+    ptr = nullptr;
+  }
+  return ptr;
+}
+
+void TargetWrapperHWAscendNPU::Free(void* ptr) { aclrtFree(ptr); }
+
+void TargetWrapperHWAscendNPU::MemcpySync(void* dst,
+                                          const void* src,
+                                          size_t size,
+                                          IoDirection dir) {
+  switch (dir) {
+    case IoDirection::HtoD:
+      aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_HOST_TO_DEVICE);
+      break;
+    case IoDirection::DtoH:
+      aclrtMemcpy(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_HOST);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
+  }
+}
+
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/backends/hw_ascend_npu/target_wrapper.h b/lite/backends/hw_ascend_npu/target_wrapper.h
new file mode 100644
index 0000000000..08c2f8d3f4
--- /dev/null
+++ b/lite/backends/hw_ascend_npu/target_wrapper.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "lite/core/target_wrapper.h"
+
+namespace paddle {
+namespace lite {
+template <>
+class TargetWrapper<TARGET(kHWAscendNPU)> {
+ public:
+  static size_t num_devices() { return 4; }
+  static size_t maximum_stream() { return 0; }
+
+  static void* Malloc(size_t size);
+  static void Free(void* ptr);
+
+  static void MemcpySync(void* dst,
+                         const void* src,
+                         size_t size,
+                         IoDirection dir);
+};
+
+using TargetWrapperHWAscendNPU = TargetWrapper<TARGET(kHWAscendNPU)>;
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/context.h b/lite/core/context.h
index d20ccde592..36738415e6 100644
--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -535,6 +535,7 @@ class ContextScheduler {
           .As()
           .CopySharedTo(&ctx->As());
       break;
+#endif
 #ifdef LITE_WITH_MLU
     case TARGET(kMLU): {
       int dev_id = TargetWrapper::GetCurDevice();
diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt
index 904bdcec0e..7193622333 100644
--- a/lite/gen_code/CMakeLists.txt
+++ b/lite/gen_code/CMakeLists.txt
@@ -17,7 +17,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
         NPU_DEPS ${npu_kernels}
         XPU_DEPS ${xpu_kernels}
         CL_DEPS ${opencl_kernels}
-        HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
+        HWAscendNPU_DEPS ${hw_ascend_npu_kernels}
         FPGA_DEPS ${fpga_kernels}
         EXCLUDE_COMPILE_DEPS "ON"
         ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
diff --git a/lite/kernels/hw_ascend_npu/CMakeLists.txt b/lite/kernels/hw_ascend_npu/CMakeLists.txt
index 0b4ff1352b..dbce1f82fd 100644
--- a/lite/kernels/hw_ascend_npu/CMakeLists.txt
+++ b/lite/kernels/hw_ascend_npu/CMakeLists.txt
@@ -1 +1,4 @@
 add_subdirectory(bridges)
+
+add_kernel(subgraph_compute_hw_ascend_npu HWAscendNPU basic SRCS subgraph_compute.cc DEPS
+           ${lite_kernel_deps} device_hw_ascend_npu subgraph_bridge_engine
${hw_ascend_npu_subgraph_bridges}) diff --git a/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt b/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt index 4b00fa549b..a9426b6e84 100644 --- a/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt +++ b/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt @@ -2,11 +2,15 @@ if (NOT LITE_WITH_HW_ASCEND_NPU) return() endif() -message(STATUS "======compile hw_ascend_npu bridges, ${ascend_builder_libs}") +message(STATUS "======compile hw_ascend_npu bridges, ${hw_ascend_npu_builder_libs}") + +lite_cc_library(subgraph_bridge_utility_hw_ascend_npu + SRCS utility.cc + DEPS ${hw_ascend_npu_builder_libs} tensor) lite_cc_library(subgraph_bridge_graph_hw_ascend_npu SRCS graph.cc - DEPS ${ascend_builder_libs}) + DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu) set(hw_ascend_npu_subgraph_bridges subgraph_bridge_graph_hw_ascend_npu diff --git a/lite/kernels/hw_ascend_npu/bridges/act_op.cc b/lite/kernels/hw_ascend_npu/bridges/act_op.cc new file mode 100644 index 0000000000..03808631a1 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/bridges/act_op.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/hw_ascend_npu/bridges/graph.h" +#include "lite/kernels/hw_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace hw_ascend_npu { + +template +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindTensor(x_name); + + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + + return SUCCESS; +} + +template <> +int ActConverter(void* ctx, + OpLite* op, + KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + // X node + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + + // Act node 
+ auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); + // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, + // clipped_relu etc. + act_op->set_attr_mode(CvtActMode(op_type)); + if (op_type == "relu_clipped") { + auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); + act_op->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "relu6") { + float Relu_clipped_coef = 6.f; + act_op->set_attr_coef(Relu_clipped_coef); + } else if (op_type == "leaky_relu") { + auto alpha = op_info->GetAttr("alpha"); + act_op->set_attr_negative_slope(alpha); + } else if (op_type == "hard_sigmoid") { + auto slope = op_info->GetAttr("slope"); + auto offset = op_info->GetAttr("offset"); + act_op->set_attr_negative_slope(slope); + act_op->set_attr_coef(offset); + } + return SUCCESS; +} + +} // namespace hw_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + sigmoid, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + tanh, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu_clipped, + kNPU, + paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + relu6, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + leaky_relu, + kHWAscendNPU, + paddle::lite::subgraph::npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + abs, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softsign, + kNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + softplus, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + hard_sigmoid, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); + +REGISTER_SUBGRAPH_BRIDGE( + log, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + square, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); +REGISTER_SUBGRAPH_BRIDGE( + sqrt, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ActConverter); diff --git a/lite/kernels/hw_ascend_npu/bridges/graph.cc b/lite/kernels/hw_ascend_npu/bridges/graph.cc index b5567aa828..eb8238aee6 100644 --- a/lite/kernels/hw_ascend_npu/bridges/graph.cc +++ b/lite/kernels/hw_ascend_npu/bridges/graph.cc @@ -14,8 +14,8 @@ #include "lite/kernels/hw_ascend_npu/bridges/graph.h" /// reference from opp package -#include #include +#include "lite/kernels/hw_ascend_npu/utility.h" namespace paddle { namespace lite { @@ -49,8 +49,9 @@ std::shared_ptr Graph::Add(const std::string& name, PrecisionType precision = tensor.precision(); if (tensor.persistable()) { // Const node - node = Add(name, precision, layout); - node->data()->set_attr_value(CvtTensor(tensor, shape, layout)); + node = Add(name, precision, layout); + node->data()->set_attr_value( + CvtTensor(tensor, shape, layout)); } else { // Data node node = Add(name, shape, precision, layout); @@ -63,10 +64,10 @@ std::shared_ptr Graph::Add(const std::string& name, std::vector shape, PrecisionType precision, DataLayoutType layout) { - auto node = Add(name, precision, layout); + auto node = Add(name, precision, layout); ge::TensorDesc desc( ge::Shape(shape), 
CvtDataLayoutType(layout), CvtPrecisionType(precision)); - node->data()->update_input_desc_x(desc); + node->data()->update_input_desc_data(desc); return node; } diff --git a/lite/kernels/hw_ascend_npu/bridges/graph.h b/lite/kernels/hw_ascend_npu/bridges/graph.h index 27dde3a151..b2841bc158 100644 --- a/lite/kernels/hw_ascend_npu/bridges/graph.h +++ b/lite/kernels/hw_ascend_npu/bridges/graph.h @@ -14,13 +14,15 @@ #pragma once +// reference from atc package +#include #include #include #include #include #include -// reference from atc package -#include "graph/operators.h" +#include "graph/operator.h" +#include "graph/operator_reg.h" #include "lite/core/op_lite.h" #include "lite/core/tensor.h" diff --git a/lite/kernels/hw_ascend_npu/bridges/utility.cc b/lite/kernels/hw_ascend_npu/bridges/utility.cc new file mode 100644 index 0000000000..72cdba36b4 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/bridges/utility.cc @@ -0,0 +1,163 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/hw_ascend_npu/utility.h" +#include +#include input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +ge::DataType CvtPrecisionType(PrecisionType itype) { + ge::DataType otype = ge::DT_FLOAT; + switch (itype) { + case PRECISION(kFloat): + otype = ge::DT_FLOAT; + break; + case PRECISION(kInt8): + otype = ge::DT_INT8; + break; + case PRECISION(kInt32): + otype = ge::DT_INT32; + break; + case PRECISION(kFP16): + otype = ge::DT_FLOAT16; + break; + case PRECISION(kBool): + otype = ge::DT_BOOL; + break; + case PRECISION(kInt64): + otype = ge::DT_INT64; + break; + case PRECISION(kInt16): + otype = ge::DT_INT16; + break; + default: + LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert precision type(" + << PrecisionToStr(itype) << ") from Lite to HW_ASCEND_NPU"; + break; + } + return otype; +} + +ge::Format CvtDataLayoutType(DataLayoutType itype) { + ge::Format otype = ge::FORMAT_NCHW; + switch (itype) { + case DATALAYOUT(kNCHW): + otype = ge::FORMAT_NCHW; + break; + case DATALAYOUT(kNHWC): + otype = ge::FORMAT_NHWC; + break; + // TODO(yanghongtian): support more data layout type + default: + LOG(FATAL) << "[HW_ASCEND_NPU] Can not convert data layout type(" + << DataLayoutToStr(itype) << ") from Lite to HW_ASCEND_NPU"; + break; + } + return otype; +} + +std::vector CvtShape(const std::vector& in_shape) { + CHECK(in_shape.size() <= 4 && in_shape.size() > 0) + << "[HW_ASCEND_NPU] The size of in_shape is invalid: " << in_shape.size(); + // Padding the shape to 4-dimensions(NCHW) + std::vector out_shape(4, 1); + std::copy(in_shape.begin(), + in_shape.end(), + out_shape.begin() + 4 - in_shape.size()); + return out_shape; +} + +std::vector CvtShape(const DDim& in_dims) { + 
return CvtShape(in_dims.Vectorize()); +} + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape, + DataLayoutType in_layout) { + PrecisionType in_precision = in_tensor.precision(); + auto in_size = in_tensor.dims().production(); + auto in_shape = in_tensor.dims().Vectorize(); + if (out_shape.empty()) { + out_shape = in_shape; + } + ge::TensorDesc out_desc(ge::Shape(out_shape), + CvtDataLayoutType(in_layout), + CvtPrecisionType(in_precision)); + auto out_size = out_desc.GetShape().GetShapeSize(); + CHECK_EQ(out_size, in_size); + ge::Tensor out_tensor; + out_tensor.SetTensorDesc(out_desc); + out_tensor.SetData(reinterpret_cast(in_tensor.raw_data()), + in_tensor.memory_size()); + return std::move(out_tensor); +} + +int CvtActMode(const std::string& act_type) { + // based on the nonlinear_fuc_ops.h in OPP (line 210) + // default to Relu + int act_mode = 1; + if (act_type == "sigmoid") { + act_mode = 0; + } else if (act_type == "relu") { + act_mode = 1; + } else if (act_type == "tanh") { + act_mode = 2; + } else if (act_type == "relu_clipped" || act_type == "relu6") { + act_mode = 3; + } else if (act_type == "elu") { + act_mode = 4; + } else if (act_type == "leaky_relu") { + act_mode = 5; + } else if (act_type == "abs") { + act_mode = 6; + } else if (act_type == "relu1") { + // TODO(yanghongtian): check hw_ascend_npu supports relu1 or not. + act_mode = 7; + } else if (act_type == "softsign") { + act_mode = 8; + } else if (act_type == "softplus") { + act_mode = 9; + } else { + // TODO(yanghongtian): support more activation mode + LOG(FATAL) << "[NPU] Unsupported activation type " << act_type; + } + return act_mode; +} + +} // namespace hw_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/hw_ascend_npu/bridges/utility.h b/lite/kernels/hw_ascend_npu/bridges/utility.h new file mode 100644 index 0000000000..5e2fe9bd40 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/bridges/utility.h @@ -0,0 +1,54 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace hw_ascend_npu { + +// Type/tensor converters for converting Paddle type/tensor to hw ascend npu IR +// type +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +ge::DataType CvtPrecisionType(PrecisionType itype); + +ge::Format CvtDataLayoutType(DataLayoutType itype); + +// Padding the shape to 4-dimensions(NCHW) for HW_ASCEND_NPU +std::vector CvtShape(const std::vector& in_shape); + +std::vector CvtShape(const DDim& in_dims); + +ge::Tensor CvtTensor(const Tensor& in_tensor, + std::vector out_shape = {}, + DataLayoutType in_layout = DATALAYOUT(kNCHW)); + +int CvtActMode(std::string act_type); +} // namespace hw_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/hw_ascend_npu/subgraph_compute.cc b/lite/kernels/hw_ascend_npu/subgraph_compute.cc new file mode 100644 index 0000000000..b0ae774d70 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/subgraph_compute.cc @@ -0,0 +1,260 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/hw_ascend_npu/subgraph_compute.h" +#include +#include +#include +#include "lite/backends/hw_ascend_npu/device.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/hw_ascend_npu/bridges/graph.h" +#include "lite/kernels/npu/bridges/paddle_use_bridges.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace hw_ascend_npu { + +int SubgraphEngine::BuildDeviceProgram() { + int status = 0; + // Convert all of ops and their input vars and weights and added into + // the HWAscendNPU IR graph + subgraph::hw_ascend_npu::Graph graph; + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) { + return subgraph::FAILED; + } + auto kernel = inst.kernel(); + status |= bridges.Select(op_type, TARGET(kHWAscendNPU))( + reinterpret_cast(&graph), op, const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + // Collect the valid input and output nodes in the HiAI IR graph and update + // the input and output names + device_inames_.clear(); + device_onames_.clear(); + std::vector device_inodes; + std::vector device_onodes; + for (auto& input_name : input_names_) { + if (graph.Has(input_name)) { + if (graph.Get(input_name)->is_data()) { + device_inodes.push_back(*graph.Get(input_name)->data()); + device_inames_.push_back(input_name); + } else { + LOG(WARNING) << "[HWAscendNPU] Input node " << input_name + << " is ignored because it is not a data node."; + } + } else { + LOG(WARNING) << "[HWAscendNPU] Input node " << input_name + << " is ignored because it does not exist."; + } + } + for (auto& output_name : output_names_) { + if (graph.Has(output_name)) { + device_onodes.push_back(*graph.Get(output_name)->data()); + device_onames_.push_back(output_name); + } else { + LOG(WARNING) << "[HWAscendNPU] Output node " << output_name + << " is ignored because it does not exist."; + } + } + CHECK(!device_inames_.empty()) + << "[HWAscendNPU] No input nodes found for building NPU model"; + CHECK(!device_onames_.empty()) + << "[HWAscendNPU] No output nodes found for building NPU model"; + + // Build the IR graph to om model as the device program + if (device_program_map_.count(inputs_shape_) > 0) { + return status; + } + auto device_client = + lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes); + if (device_client == nullptr) { + LOG(WARNING) << "[HWAscendNPU] Build model failed!"; + return subgraph::FAILED; + } + auto device_program = std::make_shared(device_client); + device_program_map_[inputs_shape_] = device_program; + + // Query and check the dimensions of valid input and output tensors + std::vector device_idims, device_odims; + if (device_program->client->GetModelIOTensorDim(&device_idims, + &device_odims) != 0) { + LOG(WARNING) << "[HWAscendNPU] Get the dimensions of input and output " + "tensors failed!"; + return subgraph::FAILED; + } + device_program->device_idims = device_idims; + device_program->device_odims = device_odims; + + CHECK_EQ(device_idims.size(), device_inames_.size()); + CHECK_EQ(device_odims.size(), device_onames_.size()); + origin_idims_.resize(device_inames_.size()); + origin_itensors_.resize(device_inames_.size()); + origin_odims_.resize(device_onames_.size()); + origin_otensors_.resize(device_onames_.size()); + + for (size_t i = 0; i < 
device_inames_.size(); i++) { + auto node = graph.Get(device_inames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); + origin_itensors_[i] = scope_->FindMutableTensor(device_inames_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + VLOG(3) << "[HWAscendNPU] Inputs[" << i << "] name: " << device_inames_[i] + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) << " dims: {" + << device_idims[i].GetNumber() << "," + << device_idims[i].GetChannel() << "," + << device_idims[i].GetHeight() << "," << device_idims[i].GetWidth() + << "}"; + // Prepare the device input tensors + CHECK_EQ(origin_idims_[i].production(), + device_idims[i].GetNumber() * device_idims[i].GetChannel() * + device_idims[i].GetHeight() * device_idims[i].GetWidth()); + } + device_program->origin_idims = origin_idims_; + + for (size_t i = 0; i < device_onames_.size(); i++) { + auto node = graph.Get(device_onames_[i]); + auto precision = node->precision(); + auto layout = node->layout(); + origin_otensors_[i] = scope_->FindMutableTensor(device_onames_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + VLOG(3) << "[HWAscendNPU] Outputs[" << i << "] name: " << device_onames_[i] + << " precision: " << PrecisionToStr(precision) + << " layout: " << DataLayoutToStr(layout) << " dims: {" + << device_odims[i].GetNumber() << "," + << device_odims[i].GetChannel() << "," + << device_odims[i].GetHeight() << "," << device_odims[i].GetWidth() + << "}"; + // Prepare the device output tensors + switch (precision) { + case PRECISION(kFloat): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kBool): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt8): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt16): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt32): + origin_otensors_[i]->mutable_data(); + break; + case PRECISION(kInt64): + origin_otensors_[i]->mutable_data(); + break; + default: + LOG(FATAL) << "[HWAscendNPU] " << device_onames_[i] + << " can't mutable data with precision type " + << PrecisionToStr(precision); + break; + } + device_program->origin_odims = origin_odims_; + + CHECK_EQ(origin_odims_[i].production(), + device_odims[i].GetNumber() * device_odims[i].GetChannel() * + device_odims[i].GetHeight() * device_odims[i].GetWidth()); + } + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + // Copy the data of origin input tensors to the buffer of input HWAscendNPU + // tensors + auto device_program = device_program_map_[inputs_shape_]; + int ret = 0; + + ret = device_program->client->SetInput(origin_itensors_, + device_program->origin_idims); + if (ret != 0) { + return ret; + } + + device_program->client->CreateOutput(device_program->origin_odims); + + // run inference + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + CHECK_EQ(device_program->client->Process(), 0); + VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time + << " us"; + + device_program->client->GetOutput(&origin_otensors_); + + return 0; +} + +bool SubgraphEngine::InputShapeChanged() { + std::vector> new_shape; + for (auto origin_itensor : origin_itensors_) { + new_shape.push_back(origin_itensor->dims().Vectorize()); + } + inputs_shape_ = new_shape; + if (device_program_map_.count(inputs_shape_) > 0) { 
+    return false;
+  }
+  return true;
+}
+
+void SubgraphCompute::PrepareForRun() {
+  auto& param = this->Param<param_t>();
+  engine_.reset(new SubgraphEngine(ctx_.get(),
+                                   param.sub_block_idx,
+                                   param.sub_block_desc,
+                                   param.input_data_names,
+                                   param.output_data_names,
+                                   param.scope));
+  CHECK(engine_);
+  engine_->Build();
+}
+
+void SubgraphCompute::Run() {
+  CHECK(engine_);
+  engine_->Launch();
+}
+
+}  // namespace hw_ascend_npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_LITE_KERNEL(subgraph,
+                     kHWAscendNPU,
+                     kAny,
+                     kNCHW,
+                     paddle::lite::kernels::hw_ascend_npu::SubgraphCompute,
+                     def)
+    .BindInput("Inputs",
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
+    .BindOutput("Outputs",
+                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
+    .Finalize();
diff --git a/lite/kernels/hw_ascend_npu/subgraph_compute.h b/lite/kernels/hw_ascend_npu/subgraph_compute.h
new file mode 100644
index 0000000000..1cab254def
--- /dev/null
+++ b/lite/kernels/hw_ascend_npu/subgraph_compute.h
@@ -0,0 +1,84 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "lite/backends/hw_ascend_npu/runtime.h"
+#include "lite/core/kernel.h"
+#include "lite/kernels/npu/bridges/engine.h"
+#include "lite/kernels/npu/bridges/registry.h"
+using HWAscendNPURuntime = paddle::lite::hw_ascend_npu::HWAscendNPURuntime;
+using TensorDesc = paddle::lite::hw_ascend_npu::TensorDesc;
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace hw_ascend_npu {
+
+class SubgraphEngine : public subgraph::Engine {
+ public:
+  SubgraphEngine(KernelContext *ctx,
+                 int block_idx,
+                 cpp::BlockDesc *block_desc,
+                 const std::vector<std::string> &input_names,
+                 const std::vector<std::string> &output_names,
+                 Scope *scope)
+      : subgraph::Engine(
+            ctx, block_idx, block_desc, input_names, output_names, scope) {}
+
+  struct device_program_t {
+    explicit device_program_t(std::shared_ptr<HWAscendNPURuntime> _client)
+        : client(_client) {}
+    std::shared_ptr<HWAscendNPURuntime> client{nullptr};
+    std::vector<DDim> origin_idims{};
+    std::vector<DDim> origin_odims{};
+    std::vector<TensorDesc> device_idims{};
+    std::vector<TensorDesc> device_odims{};
+  };
+
+ protected:
+  int BuildDeviceProgram() override;
+  int LaunchDeviceProgram() override;
+  bool InputShapeChanged() override;
+
+  std::vector<std::vector<int64_t>> inputs_shape_{};
+  std::map<std::vector<std::vector<int64_t>>,
+           std::shared_ptr<device_program_t>>
+      device_program_map_{};
+  std::vector<std::string> device_inames_{};
+  std::vector<std::string> device_onames_{};
+};
+
+class SubgraphCompute
+    : public KernelLite<TARGET(kHWAscendNPU), PRECISION(kAny)> {
+ public:
+  using param_t = operators::SubgraphParam;
+
+  void PrepareForRun() override;
+
+  void Run() override;
+
+  virtual ~SubgraphCompute() = default;
+
+ private:
+  std::unique_ptr<SubgraphEngine> engine_;
+};
+
+}  // namespace hw_ascend_npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index e53bd60c6b..13fb1a227b 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ 
b/lite/kernels/npu/bridges/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) +if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU) return() endif() -- GitLab
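
Reviewer note (not part of the patch): the sketch below shows how the new subgraph kernel would be exercised from the public C++ API once the library is built with the LITE_WITH_HW_ASCEND_NPU option that the CMake guards above test for. It assumes TargetType kHWAscendNPU is registered in the target/place enums by companion changes (this diff references it but does not define it), and the model directory is hypothetical; ops without a hw_ascend_npu bridge (only activations are bridged here) are expected to fall back to host/ARM kernels.

// usage_sketch.cc -- minimal, assumes kHWAscendNPU is a valid TargetType
#include <iostream>
#include <vector>
#include "paddle_api.h"  // NOLINT

int main() {
  using namespace paddle::lite_api;  // NOLINT
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model path
  // Prefer the Ascend NPU place so supported subgraphs are offloaded; keep
  // ARM/host places as fallback for ops the new bridges do not cover yet.
  config.set_valid_places({Place{TARGET(kHWAscendNPU), PRECISION(kFloat)},
                           Place{TARGET(kARM), PRECISION(kFloat)},
                           Place{TARGET(kHost), PRECISION(kFloat)}});
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);

  // Fill the first input with dummy data.
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;

  // The subgraph op dispatches to HWAscendNPURuntime::Process() on device.
  predictor->Run();

  auto output = predictor->GetOutput(0);
  std::cout << "output[0] = " << output->data<float>()[0] << std::endl;
  return 0;
}

Building with cmake -DLITE_WITH_HW_ASCEND_NPU=ON (plus the usual target options) is assumed to define the hw_ascend_npu_builder_libs/hw_ascend_npu_runtime_libs variables consumed by lite/backends/hw_ascend_npu/CMakeLists.txt; that wiring is not part of this diff.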