From 5cab7cdd5a002c968cedf5eb0872ce9d77370e93 Mon Sep 17 00:00:00 2001 From: yanghongtian Date: Fri, 29 May 2020 16:39:17 +0800 Subject: [PATCH] collect all --- cmake/device/hw_ascend_npu.cmake | 116 ++++++--- cmake/lite.cmake | 32 +-- lite/api/CMakeLists.txt | 2 +- lite/backends/hw_ascend_npu/CMakeLists.txt | 24 +- lite/backends/hw_ascend_npu/device.cc | 91 +++++-- lite/backends/hw_ascend_npu/device.h | 25 +- lite/backends/hw_ascend_npu/runtime.cc | 7 +- lite/kernels/hw_ascend_npu/CMakeLists.txt | 4 +- .../hw_ascend_npu/bridges/CMakeLists.txt | 18 +- lite/kernels/hw_ascend_npu/bridges/act_op.cc | 17 +- .../hw_ascend_npu/bridges/concat_op.cc | 70 +++++ lite/kernels/hw_ascend_npu/bridges/conv_op.cc | 242 ++++++++++++++++++ lite/kernels/hw_ascend_npu/bridges/graph.cc | 10 +- lite/kernels/hw_ascend_npu/bridges/graph.h | 3 +- lite/kernels/hw_ascend_npu/bridges/utility.h | 2 +- .../kernels/hw_ascend_npu/subgraph_compute.cc | 27 +- lite/kernels/npu/bridges/engine.cc | 4 + lite/tests/kernels/CMakeLists.txt | 3 +- lite/tests/kernels/activation_compute_test.cc | 1 + lite/tools/build_hw_ascend_npu.sh | 4 +- 20 files changed, 603 insertions(+), 99 deletions(-) create mode 100644 lite/kernels/hw_ascend_npu/bridges/concat_op.cc create mode 100644 lite/kernels/hw_ascend_npu/bridges/conv_op.cc diff --git a/cmake/device/hw_ascend_npu.cmake b/cmake/device/hw_ascend_npu.cmake index 6a4b95192b..9babaff3e1 100644 --- a/cmake/device/hw_ascend_npu.cmake +++ b/cmake/device/hw_ascend_npu.cmake @@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME) endif() message(STATUS "LITE_WITH_HW_ASCEND_NPU: ${LITE_WITH_HW_ASCEND_NPU}") -find_path(ACL_INC NAMES acl/acl.h - PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH) -if(NOT ACL_INC) - message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/include") -endif() - -include_directories("${ACL_INC}") - -set(ACL_LIB_FILES - acl_dvpp - ascendcl - register - runtime - ) - -foreach (libname ${ACL_LIB_FILES}) - find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/acllib/lib64) - if (lib_name_path_${libname}) - add_library(acl_${libname} SHARED IMPORTED GLOBAL) - set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}}) - list(APPEND acl_libs acl_${libname}) - else() - message(FATAL_ERROR "can not find library: ${libname}") - endif() -endforeach() - -set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs") # find atc include folder and library find_path(ATC_INC NAMES ge/ge_ir_build.h @@ -61,6 +34,8 @@ endif() include_directories("${ATC_INC}") set(ATC_LIB_FILES + ge_compiler + graph _caffe_parser auto_tiling c_sec @@ -76,9 +51,7 @@ set(ATC_LIB_FILES fmk_tensorflow_parser ge_client ge_common - ge_compiler ge_executor - graph mmpa msprof parser_common @@ -92,6 +65,16 @@ set(ATC_LIB_FILES tvm_runtime tvm_topi ) +set(ATC_PLUGIN_NNENGIN_LIB_FILES + engine + ) +set(ATC_PLUGIN_OPSKERNEL_LIB_FILES + aicpu_engine + fe + ge_local_engine + rts_engine + ) + foreach (libname ${ATC_LIB_FILES}) find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64) @@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES}) endif() endforeach() +foreach (libname ${ATC_PLUGIN_NNENGIN_LIB_FILES}) + find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/nnengine) + if (lib_name_path_${libname}) + add_library(atc_${libname} SHARED IMPORTED GLOBAL) + set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}}) + list(APPEND atc_libs 
atc_${libname}) + else() + message(FATAL_ERROR "can not find library: ${libname}") + endif() +endforeach() + +foreach (libname ${ATC_PLUGIN_OPSKERNEL_LIB_FILES}) + find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64/plugin/opskernel) + if (lib_name_path_${libname}) + add_library(atc_${libname} SHARED IMPORTED GLOBAL) + set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}}) + list(APPEND atc_libs atc_${libname}) + else() + message(FATAL_ERROR "can not find library: ${libname}") + endif() +endforeach() + # find opp include folder and library find_path(OPP_INC NAMES all_ops.h PATHS ${ASCEND_HOME}/opp/op_proto/built-in/inc) @@ -139,10 +144,59 @@ else() set_property(TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION ${OPP_FUSION_VECTORCORE}) endif() -set(hw_ascend_npu_builder_libs +add_library(hw_ascend_npu_builder_libs INTERFACE) +target_link_libraries(hw_ascend_npu_builder_libs INTERFACE + ${atc_libs} + opp_opsproto_lib + opp_fusion_pass_aicore_lib + opp_fusion_pass_vectorcore_lib) + +#set(hw_ascend_npu_builder_libs +# ${atc_libs} +# opp_opsproto_lib +# opp_fusion_pass_aicore_lib +# opp_fusion_pass_vectorcore_lib +# CACHE INTERNAL "ascend builder libs") + +# find ascend cl runtime library +find_path(ACL_INC NAMES acl/acl.h + PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH) +if(NOT ACL_INC) + message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/include") +endif() + +include_directories("${ACL_INC}") + +set(ACL_LIB_FILES + acl_dvpp + ascendcl + register + runtime + ) + +foreach (libname ${ACL_LIB_FILES}) + find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/acllib/lib64) + if (lib_name_path_${libname}) + add_library(acl_${libname} SHARED IMPORTED GLOBAL) + set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}}) + list(APPEND acl_libs acl_${libname}) + else() + message(FATAL_ERROR "can not find library: ${libname}") + endif() +endforeach() + +add_library(hw_ascend_npu_runtime_libs INTERFACE) +target_link_libraries(hw_ascend_npu_runtime_libs INTERFACE ${acl_libs}) + +add_library(hw_ascend_npu_libs INTERFACE) +target_link_libraries(hw_ascend_npu_libs INTERFACE ${atc_libs} opp_opsproto_lib opp_fusion_pass_aicore_lib opp_fusion_pass_vectorcore_lib - CACHE INTERNAL "ascend builder libs") + ${acl_libs}) + +# set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs") + + diff --git a/cmake/lite.cmake b/cmake/lite.cmake index dc9d1fb9fc..fc52cd084d 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -23,7 +23,7 @@ function (lite_deps TARGET) set(options "") set(oneValueArgs "") set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS - CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS) + CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -138,7 +138,7 @@ function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS - XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) + XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" 
"${multiValueArgs}" ${ARGN}) set(deps "") @@ -156,7 +156,7 @@ function(lite_cc_library TARGET) PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} - # MLU_DEPS ${args_MLU_DEPS} + MLU_DEPS ${args_MLU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} ) @@ -185,7 +185,7 @@ function(lite_cc_binary TARGET) endif() set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS - XPU_DEPS PROFILE_DEPS HW_ASCEND_NPU_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) + XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps "") @@ -199,12 +199,12 @@ function(lite_cc_binary TARGET) NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} - BM_DEPS ${args_BM_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${CV_DEPS} - # MLU_DEPS ${args_MLU_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) @@ -235,7 +235,7 @@ function(lite_cc_test TARGET) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS - XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS + XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) ) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -261,7 +261,7 @@ function(lite_cc_test TARGET) LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} CV_DEPS ${args_CV_DEPS} - # MLU_DEPS ${args_MLU_DEPS} + MLU_DEPS ${args_MLU_DEPS} ) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) # strip binary target to reduce size @@ -309,9 +309,9 @@ endif() function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS HW_ASCEND_NPU_DEPS - ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS + XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS + ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(LITE_BUILD_TAILOR) @@ -444,7 +444,7 @@ function(add_kernel TARGET device level) XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} BM_DEPS ${args_BM_DEPS} - #MLU_DEPS ${args_MLU_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -463,9 +463,9 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS - LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS - ARGS) + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS + XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS + ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if ("${level}" STREQUAL "extra" AND (NOT 
LITE_BUILD_EXTRA)) @@ -499,7 +499,7 @@ function(add_operator TARGET level) XPU_DEPS ${args_XPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} BM_DEPS ${args_BM_DEPS} - #MLU_DEPS ${args_MLU_DEPS} + MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 653b025c73..81cf164b10 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -69,7 +69,7 @@ if (WITH_TESTING) XPU_DEPS ${xpu_kernels} BM_DEPS ${bm_kernels} HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} - # MLU_DEPS ${mlu_kernels} + MLU_DEPS ${mlu_kernels} ) endif() if(LITE_WITH_FPGA) diff --git a/lite/backends/hw_ascend_npu/CMakeLists.txt b/lite/backends/hw_ascend_npu/CMakeLists.txt index 5d36148caf..8fdd4295fb 100644 --- a/lite/backends/hw_ascend_npu/CMakeLists.txt +++ b/lite/backends/hw_ascend_npu/CMakeLists.txt @@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU) return() endif() +lite_cc_library(build_hw_ascend_npu SRCS build.cc DEPS + hw_ascend_npu_libs) + +lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS + hw_ascend_npu_libs + build_hw_ascend_npu + ) lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS - ${hw_ascend_npu_runtime_libs}) + hw_ascend_npu_libs + build_hw_ascend_npu + ) lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS - ${hw_ascend_npu_runtime_libs} - target_wrapper_hw_ascend_npu) -lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS - ${hw_ascend_npu_runtime_libs} + hw_ascend_npu_libs target_wrapper_hw_ascend_npu - runtime_hw_ascend_npu) + device_hw_ascend_npu + build_hw_ascend_npu + ) + +add_executable(test_build test_build.cc) + +target_link_libraries(test_build build_hw_ascend_npu) diff --git a/lite/backends/hw_ascend_npu/device.cc b/lite/backends/hw_ascend_npu/device.cc index eb2ca933ec..b28f00e9c2 100644 --- a/lite/backends/hw_ascend_npu/device.cc +++ b/lite/backends/hw_ascend_npu/device.cc @@ -16,7 +16,7 @@ #include #include #include "ge/ge_api_types.h" -#include "lite/backends/hw_ascend_npu/runtime.h" +#include "lite/backends/hw_ascend_npu/build.h" #include "lite/utils/cp_logging.h" namespace paddle { @@ -26,33 +26,92 @@ std::shared_ptr Device::Build( std::vector& input_nodes, // NOLINT std::vector& output_nodes // NOLINT ) { - VLOG(3) << "[HWAscendNPU] Build model"; - // Build the IR graph to the om model - ge::Graph ir_graph("graph"); - ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); - ge::ModelBufferData model; - - std::map build_options; - build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"}); - - ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model); - - if (ret != ge::GRAPH_SUCCESS) { - LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret; + std::shared_ptr model_data = + paddle::lite::hw_ascend_npu::Build(input_nodes, output_nodes); + if (model_data == nullptr) { + LOG(ERROR) << "[HWAscendNPU] Build model failed"; return nullptr; } + LOG(INFO) << "[HWAscendNPU] Build model success"; + if (!inited_) { + if (0 == InitDevice()) { + LOG(INFO) << "Init success."; + inited_ = true; + } + } std::shared_ptr model_runtime( - new HWAscendNPURuntime(model.data, model.length)); + new HWAscendNPURuntime(model_data->data, model_data->length)); CHECK(model_runtime != nullptr); if (!model_runtime->model_loaded()) { LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance"; return nullptr; } - VLOG(3) << "[HWAscendNPU]: Build done"; + LOG(INFO) << "[HWAscendNPU]: Build 
done";
   return model_runtime;
 }
 
+int Device::InitDevice() {
+  const char* acl_conf = "/usr/local/acl.json";
+  aclError ret = aclInit(acl_conf);
+  if (ret != ACL_ERROR_NONE) {
+    LOG(ERROR) << "[HWAscendNPU] acl init failed";
+    return -1;
+  }
+
+  // open device
+  ret = aclrtSetDevice(device_id_);
+  if (ret != ACL_ERROR_NONE) {
+    LOG(ERROR) << "[HWAscendNPU] acl open device " << device_id_ << " failed";
+    return -1;
+  }
+
+  ret = aclrtCreateContext(&context_ptr_, device_id_);
+  if (ret != ACL_ERROR_NONE) {
+    LOG(ERROR) << "[HWAscendNPU] acl create context failed";
+    return -1;
+  }
+
+  // create stream
+  ret = aclrtCreateStream(&stream_ptr_);
+  if (ret != ACL_ERROR_NONE) {
+    LOG(ERROR) << "[HWAscendNPU] acl create stream failed";
+    return -1;
+  }
+
+  // get run mode
+  aclrtRunMode runMode;
+  ret = aclrtGetRunMode(&runMode);
+  if (ret != ACL_ERROR_NONE) {
+    LOG(ERROR) << "[HWAscendNPU] acl get run mode failed";
+    return -1;
+  }
+  is_devcie_ = (runMode == ACL_DEVICE);
+  LOG(INFO) << "[HWAscendNPU] Hardware initialization done";
+  return 0;
+}
+
+void Device::ReleaseDevice() {
+  aclError ret;
+  if (stream_ptr_ != nullptr) {
+    ret = aclrtDestroyStream(stream_ptr_);
+    if (ret != ACL_ERROR_NONE) {
+      LOG(ERROR) << "[HWAscendNPU] destroy stream failed";
+    }
+    stream_ptr_ = nullptr;
+  }
+  LOG(INFO) << "[HWAscendNPU] stream destroyed";
+
+  if (context_ptr_ != nullptr) {
+    ret = aclrtDestroyContext(context_ptr_);
+    if (ret != ACL_ERROR_NONE) {
+      LOG(ERROR) << "[HWAscendNPU] destroy context failed";
+    }
+    context_ptr_ = nullptr;
+  }
+  LOG(INFO) << "[HWAscendNPU] Device released successfully";
+}
+
 }  // namespace hw_ascend_npu
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/backends/hw_ascend_npu/device.h b/lite/backends/hw_ascend_npu/device.h
index ee820ead2b..5b3d8965a9 100644
--- a/lite/backends/hw_ascend_npu/device.h
+++ b/lite/backends/hw_ascend_npu/device.h
@@ -18,8 +18,9 @@
 #include 
 #include 
 #include 
-#include "ge/ge_ir_build.h"  // NOLINT
+#include "ge/ge_ir_build.h"
 #include "lite/backends/hw_ascend_npu/runtime.h"
+#include "lite/utils/cp_logging.h"
 namespace paddle {
 namespace lite {
 namespace hw_ascend_npu {
@@ -30,12 +31,11 @@ class Device {
     static Device x;
     return x;
   }
-  Device() {}
+  Device() : inited_(false) {}
 
-  int freq_level() { return freq_level_; }
-  int framework_type() { return framework_type_; }
-  int model_type() { return model_type_; }
-  int device_type() { return device_type_; }
+  ~Device() { ReleaseDevice(); }
+
+  bool is_device() const { return is_devcie_; }
 
   // Build the IR graph to om model, return a HWAscendNPURuntime instance to
   // load om model and run inference. 
@@ -45,10 +45,15 @@ class Device { ); // NOLINT private: - int freq_level_{3}; - int framework_type_{0}; - int model_type_{0}; - int device_type_{0}; + int InitDevice(); + void ReleaseDevice(); + + private: + bool inited_{false}; + int device_id_{0}; + bool is_devcie_{false}; + aclrtContext context_ptr_{nullptr}; + aclrtStream stream_ptr_{nullptr}; }; } // namespace hw_ascend_npu diff --git a/lite/backends/hw_ascend_npu/runtime.cc b/lite/backends/hw_ascend_npu/runtime.cc index d6f39c8ce7..9501198504 100644 --- a/lite/backends/hw_ascend_npu/runtime.cc +++ b/lite/backends/hw_ascend_npu/runtime.cc @@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem( &model_size_, &model_weights_size_); if (ret != ACL_ERROR_NONE) { - LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, " + LOG(ERROR) << "[HWAscendNPU]: Can't query size from a built model buffer, " "error code: " - << ret; + << ret << ", model buffer size: " << model_buff_size; return ret; } + LOG(INFO) << "[HWAscendNPU]: Query model info success, model_size: " + << model_size_ << ", model weights_size_: " << model_weights_size_; + ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY); if (ret != ACL_ERROR_NONE) { LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, " diff --git a/lite/kernels/hw_ascend_npu/CMakeLists.txt b/lite/kernels/hw_ascend_npu/CMakeLists.txt index df79bc9dd7..1b10d089a4 100644 --- a/lite/kernels/hw_ascend_npu/CMakeLists.txt +++ b/lite/kernels/hw_ascend_npu/CMakeLists.txt @@ -1,9 +1,11 @@ add_subdirectory(bridges) add_kernel(subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS - ${lite_kernel_deps} + build_hw_ascend_npu device_hw_ascend_npu subgraph_bridge_engine + runtime_hw_ascend_npu ${hw_ascend_npu_subgraph_bridges} subgraph_bridge_registry + ${lite_kernel_deps} ) diff --git a/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt b/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt index 1618fef815..3e934bdf0e 100644 --- a/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt +++ b/lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt @@ -4,18 +4,24 @@ endif() lite_cc_library(subgraph_bridge_utility_hw_ascend_npu SRCS utility.cc - DEPS ${hw_ascend_npu_builder_libs} tensor) + DEPS hw_ascend_npu_libs tensor) lite_cc_library(subgraph_bridge_graph_hw_ascend_npu SRCS graph.cc - DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu - ) + DEPS hw_ascend_npu_libs subgraph_bridge_utility_hw_ascend_npu) -set(hw_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu) + +set(hw_ascend_npu_subgraph_bridge_deps + subgraph_bridge_registry + subgraph_bridge_utility_hw_ascend_npu + subgraph_bridge_graph_hw_ascend_npu) lite_cc_library(subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS - ${hw_ascend_npu_subgraph_bridge_deps} - ${hw_ascend_npu_builder_libs}) + hw_ascend_npu_libs + ${hw_ascend_npu_subgraph_bridge_deps}) +#lite_cc_library(subgraph_bridge_concat_op_hw_ascend_npu SRCS concat_op.cc DEPS +# ${hw_ascend_npu_subgraph_bridge_deps} +# hw_ascend_npu_builder_libs) set(hw_ascend_npu_subgraph_bridges subgraph_bridge_graph_hw_ascend_npu diff --git a/lite/kernels/hw_ascend_npu/bridges/act_op.cc b/lite/kernels/hw_ascend_npu/bridges/act_op.cc index 5bbf544f91..ebbbd20619 100644 --- a/lite/kernels/hw_ascend_npu/bridges/act_op.cc +++ b/lite/kernels/hw_ascend_npu/bridges/act_op.cc @@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { 
auto op_info = op->op_info(); auto op_type = op_info->Type(); auto scope = op->scope(); - VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; + LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "..."; // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); @@ -64,28 +64,35 @@ int ActConverter(void* ctx, auto op_info = op->op_info(); auto op_type = op_info->Type(); auto scope = op->scope(); - VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; + LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "..."; // Get input and output vars and op attributes auto x_name = op_info->Input("X").front(); auto x = scope->FindMutableTensor(x_name); auto x_dims = x->dims(); auto out_name = op_info->Output("Out").front(); + LOG(INFO) << "[HWAscendNPU] xname: " << x_name << ", dims: " << x_dims; // X node std::shared_ptr x_node = nullptr; if (graph->Has(x_name)) { + LOG(INFO) << "[HWAscendNPU] graph has node: " << x_name; x_node = graph->Get(x_name); } else { + LOG(INFO) << "[HWAscendNPU] graph does no have node: " << x_name; x_node = graph->Add(x_name, *x); } + LOG(INFO) << "[HWAscendNPU] out name: " << out_name; +#if 0 // Act node auto act_node = graph->template Add(out_name); auto act_op = act_node->template data(); act_op->set_input_x(*x_node->data()); // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // clipped_relu etc. + LOG(INFO) << "[HWAscendNPU] activation mode: " << op_type + << ", type: " << CvtActMode(op_type); act_op->set_attr_mode(CvtActMode(op_type)); if (op_type == "relu_clipped") { auto Relu_clipped_coef = op_info->GetAttr("Relu_clipped_coef"); @@ -94,6 +101,12 @@ int ActConverter(void* ctx, float Relu_clipped_coef = 6.f; act_op->set_attr_coef(Relu_clipped_coef); } +#else + // Act node + auto act_node = graph->template Add(out_name); + auto act_op = act_node->template data(); + act_op->set_input_x(*x_node->data()); +#endif return SUCCESS; } diff --git a/lite/kernels/hw_ascend_npu/bridges/concat_op.cc b/lite/kernels/hw_ascend_npu/bridges/concat_op.cc new file mode 100644 index 0000000000..bc2c7db961 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/bridges/concat_op.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "lite/kernels/hw_ascend_npu/bridges/graph.h" +#include "lite/kernels/hw_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" +namespace paddle { +namespace lite { +namespace subgraph { +namespace hw_ascend_npu { + +int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HWAscendNPU] Converting " << op_type << " ... 
"; + + // Get input and output vars and op attributes + auto x_names = op_info->Input("X"); + auto out_name = op_info->Output("Out").front(); + auto axis = op_info->GetAttr("axis"); + auto num = x_names.size(); + + // Traverse all of input nodes which are added into the new created concat + // node + auto concat_node = graph->Add(out_name); + auto concat_op = concat_node->data(); + concat_op->set_input_concat_dim(axis); + concat_op->set_attr_N(num); + concat_op->create_dynamic_input_input_values(num); + int idx = 1; + for (auto& x_name : x_names) { + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + x_node = graph->Get(x_name); + } else { + x_node = graph->Add(x_name, *x); + } + concat_op->set_dynamic_input_input_values(idx, *x_node->data()); + idx++; + } + return SUCCESS; +} + +} // namespace hw_ascend_npu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE( + concat, + kHWAscendNPU, + paddle::lite::subgraph::hw_ascend_npu::ConcatConverter); diff --git a/lite/kernels/hw_ascend_npu/bridges/conv_op.cc b/lite/kernels/hw_ascend_npu/bridges/conv_op.cc new file mode 100644 index 0000000000..94b4deda07 --- /dev/null +++ b/lite/kernels/hw_ascend_npu/bridges/conv_op.cc @@ -0,0 +1,242 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/conv_op.h" +#include "lite/kernels/hw_ascend_npu/bridges/graph.h" +#include "lite/kernels/hw_ascend_npu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace hw_ascend_npu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[HwAscendNPU] Converting " << op_type << "... 
"; + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + + // Input node + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + input_node = graph->Get(input_name); + } else { + input_node = graph->Add(input_name, *input); + } + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) << "[HwAscendNPU] Paddings size should be the " + "same or twice as the input size."; + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + // Check depthwise mode, and decide whether use ConvolutionDepthwise Op + bool use_depthwise_conv = + false; // Whether use ge::op::ConvolutionDepthwise ? 
+ bool is_depthwise_mode = ic == groups && oc == groups; + if (is_depthwise_mode && + !((groups == 1 || groups >= 5) && dilations[0] == 1 && + dilations[1] == 1)) { + use_depthwise_conv = true; + LOG(WARNING) + << "[HwAscendNPU] For depthwise mode, dilation = 1 and groups >= 5 " + "(or groups = 1) is only supported in Convolution Op, so " + "force to use ConvolutionDepthwise Op, but may lead poor " + "performance."; + } + + // Filter node + auto filter_node = graph->Add(filter_name, *filter); + + // Add bias node if exists bias + // Supports the bias nodes with the following dimensions + // 0: {oc} + // 1: {1, oc, oh, ow} + // 2: {n, oc, oh, ow} + std::shared_ptr bias_node = nullptr; + bool is_channel_bias = false; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + if (graph->Has(bias_name)) { + bias_node = graph->Get(bias_name); + } else { + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + auto bias_data_size = bias_dims.production(); + auto output_data_size = output_dims.production(); + std::vector bias_shape; + if (bias_data_size == oc) { + // 0: {oc} + bias_shape = {1, oc, 1, 1}; + is_channel_bias = true; + } else if (bias_data_size == output_data_size / bs) { + // 1: {1, oc, oh, ow} + bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]}; + } else if (bias_data_size == output_data_size) { + // 2: {n, oc, oh, ow} + bias_shape = output_dims.Vectorize(); + } else { + LOG(WARNING) + << "[HwAscendNPU] Bias dimension " << bias_dims + << " isn't supported in conv2d Op when output dimension is " + << output_dims; + return FAILED; + } + bias_node = graph->Add(bias_name, *bias, bias_shape); + } + } + + // Conv node + std::shared_ptr conv_node = nullptr; + if (use_depthwise_conv && is_depthwise_mode) { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_filter(*filter_node->data()); + conv_op->set_attr_mode(1); + conv_op->set_attr_algo(0); + conv_op->set_attr_format(0); // NCHW + conv_op->set_attr_pad_mode(5); // VALID + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( + {paddings[0], paddings[1], paddings[2], paddings[3]})); + conv_op->set_attr_dilation( + ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + conv_op->set_attr_kernel( + ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]})); + // ConvolutionDepthwise Op doesn't support bias, so append Add node to + // support bias + if (bias_node != nullptr) { + auto add_node = graph->Add(output_name); + auto add_op = add_node->data(); + add_op->set_input_x1(*conv_node->data()); + add_op->set_input_x2(*bias_node->data()); + conv_node = add_node; + } + } else { + conv_node = graph->Add(output_name); + auto conv_op = conv_node->data(); + conv_op->set_input_x(*input_node->data()); + conv_op->set_input_w(*filter_node->data()); + conv_op->set_attr_mode(1); + // when padding_algorithm=="SAME", NPU is different from lite + if (padding_algorithm == "VALID") { + conv_op->set_attr_pad_mode(5); + } else { + conv_op->set_attr_pad_mode(0); + } + conv_op->set_attr_group(groups); + conv_op->set_attr_pad(ge::AttrValue::LIST_INT( + {paddings[0], paddings[1], paddings[2], paddings[3]})); + conv_op->set_attr_dilation( + ge::AttrValue::LIST_INT({dilations[0], dilations[1]})); + conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]})); + 
conv_op->set_attr_kernel(
+        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
+    // Convolution Op only supports bias with dimension {1, oc, 1, 1}, so
+    // append an Add node if the dimension is {1, oc, oh, ow} or {n, oc, oh, ow}
+    if (bias_node != nullptr) {
+      if (is_channel_bias) {
+        conv_op->set_input_b(*bias_node->data());
+      } else {
+        auto add_node = graph->Add(output_name);
+        auto add_op = add_node->data();
+        add_op->set_input_x1(*conv_node->data());
+        add_op->set_input_x2(*bias_node->data());
+        conv_node = add_node;
+      }
+    }
+  }
+  CHECK(conv_node);
+
+  if (!act_type.empty()) {
+    auto act_node = graph->Add(output_name);
+    auto act_op = act_node->data();
+    act_op->set_input_x(*conv_node->data());
+    act_op->set_attr_mode(CvtActMode(act_type));
+    if (act_type == "leaky_relu") {
+      act_op->set_attr_negative_slope(leaky_relu_alpha);
+    } else if (act_type == "relu6") {
+      act_op->set_attr_coef(6.f);
+    }
+  }
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace hw_ascend_npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(conv2d,
+                         kHWAscendNPU,
+                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
+REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
+                         kHWAscendNPU,
+                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
diff --git a/lite/kernels/hw_ascend_npu/bridges/graph.cc b/lite/kernels/hw_ascend_npu/bridges/graph.cc
index 3e651edbae..c587647149 100644
--- a/lite/kernels/hw_ascend_npu/bridges/graph.cc
+++ b/lite/kernels/hw_ascend_npu/bridges/graph.cc
@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr node) {
   if (it != nodes_.end()) {
     // Only variable node can be shared with the same name
     if (!node->is_var() || !it->second.back()->is_var()) {
-      LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined.";
+      LOG(FATAL) << "[HWAscendNPU] Const or data node " << name
+                 << " is redefined.";
       return -1;
     }
   } else {
@@ -65,6 +66,13 @@ std::shared_ptr Graph::Add(const std::string& name,
                                   PrecisionType precision,
                                   DataLayoutType layout) {
   auto node = Add(name, precision, layout);
+  std::stringstream iss;
+  iss << "[HWAscendNPU] Add data node, shape: ";
+  for (auto& s : shape) {
+    iss << s << ",";
+  }
+  iss << " name: " << name;
+  LOG(INFO) << iss.str();
   ge::TensorDesc desc(
       ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
   node->data()->update_input_desc_data(desc);
diff --git a/lite/kernels/hw_ascend_npu/bridges/graph.h b/lite/kernels/hw_ascend_npu/bridges/graph.h
index b2841bc158..33f09eb72b 100644
--- a/lite/kernels/hw_ascend_npu/bridges/graph.h
+++ b/lite/kernels/hw_ascend_npu/bridges/graph.h
@@ -181,13 +181,14 @@ class Graph {
   }
 
   std::shared_ptr Get(std::string name) {
-    CHECK(Has(name)) << "[NPU] Node " << name << " not found.";
+    CHECK(Has(name)) << "[HWAscendNPU] Node " << name << " not found.";
     return nodes_.at(name).back();
   }
 
   bool Has(const std::string& name) {
     return nodes_.find(name) != nodes_.end();
   }
+  size_t size() const { return nodes_.size(); }
 
  private:
   std::unordered_map>> nodes_;
diff --git a/lite/kernels/hw_ascend_npu/bridges/utility.h b/lite/kernels/hw_ascend_npu/bridges/utility.h
index 5e2fe9bd40..7c8b645940 100644
--- a/lite/kernels/hw_ascend_npu/bridges/utility.h
+++ b/lite/kernels/hw_ascend_npu/bridges/utility.h
@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
                      std::vector out_shape = {},
                      DataLayoutType in_layout = DATALAYOUT(kNCHW));
 
-int CvtActMode(std::string act_type);
+int CvtActMode(const std::string& act_type);
 }  // namespace hw_ascend_npu
 }  // namespace subgraph
 }  // 
namespace lite diff --git a/lite/kernels/hw_ascend_npu/subgraph_compute.cc b/lite/kernels/hw_ascend_npu/subgraph_compute.cc index b7d27f5931..cbcee1659d 100644 --- a/lite/kernels/hw_ascend_npu/subgraph_compute.cc +++ b/lite/kernels/hw_ascend_npu/subgraph_compute.cc @@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() { // the HWAscendNPU IR graph subgraph::hw_ascend_npu::Graph graph; const auto& bridges = subgraph::Registry::Instance(); + LOG(INFO) << "[HWAscendNPU] Build device program"; for (auto& inst : origin_program_) { auto op = const_cast(inst.op()); CHECK(op); op->CheckShape(); op->InferShape(); std::string op_type = op->op_info()->Type(); + LOG(INFO) << "[HWAscendNPU] trying to convert OP: " << op_type; if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) { + LOG(ERROR) << "[HWAscendNPU] OP: " << op_type + << " does not exist for target HWAscendNPU"; return subgraph::FAILED; } + LOG(INFO) << "[HWAscendNPU] OP: " << op_type << " exists for HWAscendNPU"; auto kernel = inst.kernel(); status |= bridges.Select(op_type, TARGET(kHWAscendNPU))( reinterpret_cast(&graph), op, const_cast(kernel)); if (subgraph::CHECK_FAILED(status)) { + LOG(ERROR) << "[HWAscendNPU] OP: " << op_type << " select kernel failed"; return subgraph::FAILED; } + LOG(INFO) << "[HWAscendNPU] OP: " << op_type + << " select kernel for HWAscendNPU"; } + LOG(INFO) << "[HWAscendNPU] Graph size: " << graph.size(); // Collect the valid input and output nodes in the HiAI IR graph and update // the input and output names device_inames_.clear(); @@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() { std::vector device_inodes; std::vector device_onodes; for (auto& input_name : input_names_) { + LOG(INFO) << "[HWAscendNPU] input name: " << input_name; if (graph.Has(input_name)) { + LOG(INFO) << "[HWAscendNPU] Graph has input name: " << input_name; if (graph.Get(input_name)->is_data()) { + LOG(INFO) << "[HWAscendNPU] the current input name: " << input_name + << " is data"; device_inodes.push_back(*graph.Get(input_name)->data()); device_inames_.push_back(input_name); } else { @@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() { CHECK(!device_onames_.empty()) << "[HWAscendNPU] No output nodes found for building NPU model"; + LOG(INFO) << "[HWAscendNPU] Graph size to build: " << graph.size(); + // Build the IR graph to om model as the device program if (device_program_map_.count(inputs_shape_) > 0) { return status; } + LOG(INFO) << "[HWAscendNPU] Start to build, device_inodes = " + << device_inodes.size() + << ", device_onodes = " << device_onodes.size(); auto device_client = lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes); if (device_client == nullptr) { @@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() { // tensors auto device_program = device_program_map_[inputs_shape_]; int ret = 0; + LOG(INFO) << "[HWAscendNPU] start to set input..."; ret = device_program->client->SetInput(origin_itensors_, device_program->origin_idims); if (ret != 0) { return ret; } + LOG(INFO) << "[HWAscendNPU] start to create output..."; device_program->client->CreateOutput(device_program->origin_odims); @@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() { }; auto start_time = GetCurrentUS(); CHECK_EQ(device_program->client->Process(), 0); - VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time - << " us"; + LOG(INFO) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time + << " us"; device_program->client->GetOutput(&origin_otensors_); + LOG(INFO) << 
"[HWAscendNPU] Get ouput done"; return 0; } @@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() { } void SubgraphCompute::Run() { + LOG(INFO) << "[HWAscendNPU] Start to run"; CHECK(engine_); + LOG(INFO) << "[HWAscendNPU] Start to call Launch"; engine_->Launch(); } diff --git a/lite/kernels/npu/bridges/engine.cc b/lite/kernels/npu/bridges/engine.cc index 9961d5f17e..8409b51d63 100644 --- a/lite/kernels/npu/bridges/engine.cc +++ b/lite/kernels/npu/bridges/engine.cc @@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() { } int Engine::Launch() { + LOG(INFO) << "[HWAscendNPU] in Launch, start to build if needed"; // Rebuild device program when the shapes of input tensors have been changed. if (CHECK_SUCCESS(build_device_program_status_) && CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && InputShapeChanged()) { Build(); } + LOG(INFO) << "[HWAscendNPU] launch program"; if (CHECK_FAILED(build_device_program_status_)) { + LOG(INFO) << "[HWAscendNPU] launch original program"; LaunchOriginProgram(); } else { + LOG(INFO) << "[HWAscendNPU] launch device program"; LaunchDeviceProgram(); } return 0; diff --git a/lite/tests/kernels/CMakeLists.txt b/lite/tests/kernels/CMakeLists.txt index c7c84f1d69..e62ba6e769 100644 --- a/lite/tests/kernels/CMakeLists.txt +++ b/lite/tests/kernels/CMakeLists.txt @@ -9,7 +9,8 @@ set (kernels ${host_kernels} ${hw_ascend_npu_kernels}) -if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) +message(STATUS "======---------------------------------=================${hw_ascend_npu_kernels}") +if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) diff --git a/lite/tests/kernels/activation_compute_test.cc b/lite/tests/kernels/activation_compute_test.cc index 7a0aa5e511..39f1b21631 100644 --- a/lite/tests/kernels/activation_compute_test.cc +++ b/lite/tests/kernels/activation_compute_test.cc @@ -293,6 +293,7 @@ TEST(Activation_relu, precision) { place = TARGET(kXPU); #elif defined(LITE_WITH_HW_ASCEND_NPU) place = TARGET(kHWAscendNPU); + std::cout << "-----------test relu with hw_ascend_npu" << std::endl; #else return; #endif diff --git a/lite/tools/build_hw_ascend_npu.sh b/lite/tools/build_hw_ascend_npu.sh index 62f950e216..592e9e15af 100755 --- a/lite/tools/build_hw_ascend_npu.sh +++ b/lite/tools/build_hw_ascend_npu.sh @@ -3,7 +3,7 @@ set -ex # global variables with default value ASCEND_HOME="/usr/local/Ascend" # Ascend SDK root directory -TARGET_NAME="test_subgraph_pass" # default target +TARGET_NAME="test_kernel_activation_compute" # default target BUILD_EXTRA=ON # ON(with sequence ops)/OFF WITH_TESTING=ON # ON/OFF @@ -80,7 +80,7 @@ function build_hw_ascend_npu { -DWITH_TESTING=${WITH_TESTING} \ 
-DASCEND_HOME=${HW_ASCEND_NPU_SDK_ROOT} - make -j$NUM_CORES_FOR_COMPILE + make $TARGET_NAME -j2 cd - echo "Done" -- GitLab
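
For anyone trying the change locally, a minimal usage sketch follows. It is not
part of the patch itself; it assumes the mail above is saved as
collect-all.patch at the root of a Paddle-Lite checkout and that the Ascend
toolkit is installed under /usr/local/Ascend, the default ASCEND_HOME used by
the build script.

    # Dry-run first to confirm the patch applies cleanly, then apply it.
    git apply --check collect-all.patch
    git am collect-all.patch
    # Build with the defaults declared at the top of the script
    # (TARGET_NAME=test_kernel_activation_compute, WITH_TESTING=ON).
    ./lite/tools/build_hw_ascend_npu.sh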