提交 5cab7cdd 编写于 作者: Y yanghongtian

collect all

上级 f6cf1f9f
...@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME) ...@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME)
endif() endif()
message(STATUS "LITE_WITH_HW_ASCEND_NPU: ${LITE_WITH_HW_ASCEND_NPU}") message(STATUS "LITE_WITH_HW_ASCEND_NPU: ${LITE_WITH_HW_ASCEND_NPU}")
# Locate the Ascend CL (ACL) runtime: the acl/acl.h header directory plus the
# runtime shared libraries, each wrapped in an imported target named
# acl_<lib>.  On success the list of imported targets is accumulated in
# `acl_libs`; any missing piece aborts the configure step.
find_path(ACL_INC NAMES acl/acl.h
          PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
  # Report the directory that was actually searched (the previous message
  # wrongly pointed at ${ASCEND_HOME}/include).
  message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/acllib/include")
endif()
include_directories("${ACL_INC}")

# Runtime libraries shipped under <ascend>/acllib/lib64.
set(ACL_LIB_FILES
    acl_dvpp
    ascendcl
    register
    runtime
    )

foreach(libname ${ACL_LIB_FILES})
  # NO_DEFAULT_PATH keeps the lookup consistent with the find_path() above:
  # only the Ascend toolkit tree is searched, never system directories.
  find_library(lib_name_path_${libname} NAMES ${libname}
               PATHS ${ASCEND_HOME}/acllib/lib64 NO_DEFAULT_PATH)
  if(lib_name_path_${libname})
    # Wrap the raw .so path in an imported target so consumers can link the
    # target name instead of an absolute path.
    add_library(acl_${libname} SHARED IMPORTED GLOBAL)
    set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION
                 ${lib_name_path_${libname}})
    list(APPEND acl_libs acl_${libname})
  else()
    message(FATAL_ERROR "can not find library: ${libname} in ${ASCEND_HOME}/acllib/lib64")
  endif()
endforeach()
# find atc include folder and library # find atc include folder and library
find_path(ATC_INC NAMES ge/ge_ir_build.h find_path(ATC_INC NAMES ge/ge_ir_build.h
...@@ -61,6 +34,8 @@ endif() ...@@ -61,6 +34,8 @@ endif()
include_directories("${ATC_INC}") include_directories("${ATC_INC}")
set(ATC_LIB_FILES set(ATC_LIB_FILES
ge_compiler
graph
_caffe_parser _caffe_parser
auto_tiling auto_tiling
c_sec c_sec
...@@ -76,9 +51,7 @@ set(ATC_LIB_FILES ...@@ -76,9 +51,7 @@ set(ATC_LIB_FILES
fmk_tensorflow_parser fmk_tensorflow_parser
ge_client ge_client
ge_common ge_common
ge_compiler
ge_executor ge_executor
graph
mmpa mmpa
msprof msprof
parser_common parser_common
...@@ -92,6 +65,16 @@ set(ATC_LIB_FILES ...@@ -92,6 +65,16 @@ set(ATC_LIB_FILES
tvm_runtime tvm_runtime
tvm_topi tvm_topi
) )
# ATC "nnengine" plugin libraries, searched later under
# ${ASCEND_HOME}/atc/lib64/plugin/nnengine.
set(ATC_PLUGIN_NNENGIN_LIB_FILES
    engine
    )
# ATC "opskernel" plugin libraries, searched later under
# ${ASCEND_HOME}/atc/lib64/plugin/opskernel.
set(ATC_PLUGIN_OPSKERNEL_LIB_FILES
    aicpu_engine
    fe
    ge_local_engine
    rts_engine
    )
foreach (libname ${ATC_LIB_FILES}) foreach (libname ${ATC_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64) find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/atc/lib64)
...@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES}) ...@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES})
endif() endif()
endforeach() endforeach()
# Helper: find each ATC plugin library under
# ${ASCEND_HOME}/atc/lib64/plugin/<plugin_dir>, wrap it in an imported target
# atc_<lib>, and append the target to `atc_libs`.  A macro (not a function)
# is used on purpose: find_library cache entries, the imported targets and
# the `atc_libs` list must all land in the caller's scope, exactly as the
# two hand-written loops this replaces did.
#   plugin_dir - subdirectory of atc/lib64/plugin (nnengine | opskernel)
#   ARGN       - library base names to locate
macro(hw_ascend_import_atc_plugin_libs plugin_dir)
  foreach(libname ${ARGN})
    find_library(lib_name_path_${libname} NAMES ${libname}
                 PATHS ${ASCEND_HOME}/atc/lib64/plugin/${plugin_dir})
    if(lib_name_path_${libname})
      add_library(atc_${libname} SHARED IMPORTED GLOBAL)
      set_property(TARGET atc_${libname} PROPERTY IMPORTED_LOCATION
                   ${lib_name_path_${libname}})
      list(APPEND atc_libs atc_${libname})
    else()
      message(FATAL_ERROR "can not find library: ${libname}")
    endif()
  endforeach()
endmacro()

hw_ascend_import_atc_plugin_libs(nnengine ${ATC_PLUGIN_NNENGIN_LIB_FILES})
hw_ascend_import_atc_plugin_libs(opskernel ${ATC_PLUGIN_OPSKERNEL_LIB_FILES})
# find opp include folder and library # find opp include folder and library
find_path(OPP_INC NAMES all_ops.h find_path(OPP_INC NAMES all_ops.h
PATHS ${ASCEND_HOME}/opp/op_proto/built-in/inc) PATHS ${ASCEND_HOME}/opp/op_proto/built-in/inc)
...@@ -139,10 +144,59 @@ else() ...@@ -139,10 +144,59 @@ else()
set_property(TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION ${OPP_FUSION_VECTORCORE}) set_property(TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION ${OPP_FUSION_VECTORCORE})
endif() endif()
set(hw_ascend_npu_builder_libs add_library(hw_ascend_npu_builder_libs INTERFACE)
target_link_libraries(hw_ascend_npu_builder_libs INTERFACE
${atc_libs}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib)
#set(hw_ascend_npu_builder_libs
# ${atc_libs}
# opp_opsproto_lib
# opp_fusion_pass_aicore_lib
# opp_fusion_pass_vectorcore_lib
# CACHE INTERNAL "ascend builder libs")
# find ascend cl runtime library
find_path(ACL_INC NAMES acl/acl.h
PATHS ${ASCEND_HOME}/acllib/include NO_DEFAULT_PATH)
if(NOT ACL_INC)
message(FATAL_ERROR "Can not find acl/acl.h in ${ASCEND_HOME}/include")
endif()
include_directories("${ACL_INC}")
set(ACL_LIB_FILES
acl_dvpp
ascendcl
register
runtime
)
foreach (libname ${ACL_LIB_FILES})
find_library(lib_name_path_${libname} NAMES ${libname} PATHS ${ASCEND_HOME}/acllib/lib64)
if (lib_name_path_${libname})
add_library(acl_${libname} SHARED IMPORTED GLOBAL)
set_property(TARGET acl_${libname} PROPERTY IMPORTED_LOCATION ${lib_name_path_${libname}})
list(APPEND acl_libs acl_${libname})
else()
message(FATAL_ERROR "can not find library: ${libname}")
endif()
endforeach()
add_library(hw_ascend_npu_runtime_libs INTERFACE)
target_link_libraries(hw_ascend_npu_runtime_libs INTERFACE ${acl_libs})
add_library(hw_ascend_npu_libs INTERFACE)
target_link_libraries(hw_ascend_npu_libs INTERFACE
${atc_libs} ${atc_libs}
opp_opsproto_lib opp_opsproto_lib
opp_fusion_pass_aicore_lib opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib opp_fusion_pass_vectorcore_lib
CACHE INTERNAL "ascend builder libs") ${acl_libs})
# set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
...@@ -23,7 +23,7 @@ function (lite_deps TARGET) ...@@ -23,7 +23,7 @@ function (lite_deps TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS) CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS}) set(deps ${lite_deps_DEPS})
...@@ -138,7 +138,7 @@ function(lite_cc_library TARGET) ...@@ -138,7 +138,7 @@ function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module) set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "") set(deps "")
...@@ -156,7 +156,7 @@ function(lite_cc_library TARGET) ...@@ -156,7 +156,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
# MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
) )
...@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET) ...@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET)
endif() endif()
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS HW_ASCEND_NPU_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "") set(deps "")
...@@ -199,12 +199,12 @@ function(lite_cc_binary TARGET) ...@@ -199,12 +199,12 @@ function(lite_cc_binary TARGET)
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS} CV_DEPS ${CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
) )
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
...@@ -235,7 +235,7 @@ function(lite_cc_test TARGET) ...@@ -235,7 +235,7 @@ function(lite_cc_test TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS COMPILE_LEVEL # (basic|extra) ARGS COMPILE_LEVEL # (basic|extra)
) )
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -261,7 +261,7 @@ function(lite_cc_test TARGET) ...@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS} CV_DEPS ${args_CV_DEPS}
# MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
) )
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS}) _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size # strip binary target to reduce size
...@@ -309,9 +309,9 @@ endif() ...@@ -309,9 +309,9 @@ endif()
function(add_kernel TARGET device level) function(add_kernel TARGET device level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS HW_ASCEND_NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(LITE_BUILD_TAILOR) if(LITE_BUILD_TAILOR)
...@@ -444,7 +444,7 @@ function(add_kernel TARGET device level) ...@@ -444,7 +444,7 @@ function(add_kernel TARGET device level)
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
...@@ -463,9 +463,9 @@ endif() ...@@ -463,9 +463,9 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
...@@ -499,7 +499,7 @@ function(add_operator TARGET level) ...@@ -499,7 +499,7 @@ function(add_operator TARGET level)
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS} HW_ASCEND_NPU_DEPS ${args_HW_ASCEND_NPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
#MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
......
...@@ -69,7 +69,7 @@ if (WITH_TESTING) ...@@ -69,7 +69,7 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels} HW_ASCEND_NPU_DEPS ${hw_ascend_npu_kernels}
# MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
) )
endif() endif()
if(LITE_WITH_FPGA) if(LITE_WITH_FPGA)
......
...@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU) ...@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU)
return() return()
endif() endif()
lite_cc_library(build_hw_ascend_npu SRCS build.cc DEPS
hw_ascend_npu_libs)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS lite_cc_library(target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${hw_ascend_npu_runtime_libs}) hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS lite_cc_library(runtime_hw_ascend_npu SRCS runtime.cc DEPS
${hw_ascend_npu_runtime_libs} hw_ascend_npu_libs
target_wrapper_hw_ascend_npu)
lite_cc_library(device_hw_ascend_npu SRCS device.cc DEPS
${hw_ascend_npu_runtime_libs}
target_wrapper_hw_ascend_npu target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu) device_hw_ascend_npu
build_hw_ascend_npu
)
add_executable(test_build test_build.cc)
target_link_libraries(test_build build_hw_ascend_npu)
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include <map> #include <map>
#include <string> #include <string>
#include "ge/ge_api_types.h" #include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/runtime.h" #include "lite/backends/hw_ascend_npu/build.h"
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
...@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build( ...@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build(
std::vector<ge::Operator>& input_nodes, // NOLINT std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes // NOLINT std::vector<ge::Operator>& output_nodes // NOLINT
) { ) {
VLOG(3) << "[HWAscendNPU] Build model"; std::shared_ptr<ge::ModelBufferData> model_data =
// Build the IR graph to the om model paddle::lite::hw_ascend_npu::Build(input_nodes, output_nodes);
ge::Graph ir_graph("graph"); if (model_data == nullptr) {
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes); LOG(ERROR) << "[HWAscendNPU] Build model failed";
ge::ModelBufferData model;
std::map<std::string, std::string> build_options;
build_options.insert({ge::ir_option::EXEC_DISABLE_REUSED_MEMORY, "1"});
ge::graphStatus ret = aclgrphBuildModel(ir_graph, build_options, model);
if (ret != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HWAscendNPU] Build model failed, error code: " << ret;
return nullptr; return nullptr;
} }
LOG(INFO) << "[HWAscendNPU] Build model success";
if (!inited_) {
if (0 == InitDevice()) {
LOG(INFO) << "Init success.";
inited_ = true;
}
}
std::shared_ptr<HWAscendNPURuntime> model_runtime( std::shared_ptr<HWAscendNPURuntime> model_runtime(
new HWAscendNPURuntime(model.data, model.length)); new HWAscendNPURuntime(model_data->data, model_data->length));
CHECK(model_runtime != nullptr); CHECK(model_runtime != nullptr);
if (!model_runtime->model_loaded()) { if (!model_runtime->model_loaded()) {
LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance"; LOG(ERROR) << "[HWAscendNPU]: Can not create model runtime instance";
return nullptr; return nullptr;
} }
VLOG(3) << "[HWAscendNPU]: Build done"; LOG(INFO) << "[HWAscendNPU]: Build done";
return model_runtime; return model_runtime;
} }
int Device::InitDevice() {
const char* acl_conf = "/usr/local/acl.json";
aclError ret = aclInit(acl_conf);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl init failed";
return -1;
}
// open device
ret = aclrtSetDevice(device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl open device " << device_id_ << " failed";
return -1;
}
ret = aclrtCreateContext(&context_ptr_, device_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "acl create context failed";
return -1;
}
// create stream
ret = aclrtCreateStream(&stream_ptr_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl create stream failed";
return -1;
}
// get run mode
aclrtGetRunMode runMode;
ret = aclrtGetMode(&runMode);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU] acl get run mode failed";
return -1;
}
is_devcie_ = (runMode == ACL_DEVICE);
LOG(INFO) << "[HWAscendNPU] Hardware initialization done";
return 0;
}
// Tear down the per-process ACL resources created by InitDevice(), in the
// reverse order of creation: stream first, then context.  Safe to call when
// initialization failed part-way (null handles are skipped) and idempotent,
// since each handle is nulled after destruction.
// NOTE(review): InitDevice() also calls aclInit() and aclrtSetDevice(), but
// no matching aclrtResetDevice()/aclFinalize() appears here — confirm whether
// that is intentional (e.g. done elsewhere) or a leak at process shutdown.
void Device::ReleaseDevice() {
  aclError ret;
  if (stream_ptr_ != nullptr) {
    ret = aclrtDestroyStream(stream_ptr_);
    if (ret != ACL_ERROR_NONE) {
      // Best-effort teardown: log and continue so the context is still freed.
      LOG(ERROR) << "[HWAscendNPU] destroy stream failed";
    }
    stream_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] end to destroy stream";
  if (context_ptr_ != nullptr) {
    ret = aclrtDestroyContext(context_ptr_);
    if (ret != ACL_ERROR_NONE) {
      LOG(ERROR) << "[HWAscendNPU] destroy context failed";
    }
    context_ptr_ = nullptr;
  }
  LOG(INFO) << "[HWAscendNPU] Release device successfully";
}
} // namespace hw_ascend_npu } // namespace hw_ascend_npu
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -18,8 +18,9 @@ ...@@ -18,8 +18,9 @@
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
#include "ge/ge_ir_build.h" // NOLINT #include "ge/ge_ir_build.h"
#include "lite/backends/hw_ascend_npu/runtime.h" #include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace hw_ascend_npu { namespace hw_ascend_npu {
...@@ -30,12 +31,11 @@ class Device { ...@@ -30,12 +31,11 @@ class Device {
static Device x; static Device x;
return x; return x;
} }
Device() {} Device() : inited_(false) {}
int freq_level() { return freq_level_; } ~Device() { ReleaseDevice(); }
int framework_type() { return framework_type_; }
int model_type() { return model_type_; } bool is_device() const { return is_devcie_; }
int device_type() { return device_type_; }
// Build the IR graph to om model, return a HWAscendNPURuntime instance to // Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference. // load om model and run inference.
...@@ -45,10 +45,15 @@ class Device { ...@@ -45,10 +45,15 @@ class Device {
); // NOLINT ); // NOLINT
private: private:
int freq_level_{3}; int InitDevice();
int framework_type_{0}; void ReleaseDevice();
int model_type_{0};
int device_type_{0}; private:
bool inited_{false};
int device_id_{0};
bool is_devcie_{false};
aclrtContext context_ptr_{nullptr};
aclrtStream stream_ptr_{nullptr};
}; };
} // namespace hw_ascend_npu } // namespace hw_ascend_npu
......
...@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem( ...@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem(
&model_size_, &model_size_,
&model_weights_size_); &model_weights_size_);
if (ret != ACL_ERROR_NONE) { if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can query size from a built model buffer, " LOG(ERROR) << "[HWAscendNPU]: Can't query size from a built model buffer, "
"error code: " "error code: "
<< ret; << ret << ", model buffer size: " << model_buff_size;
return ret; return ret;
} }
LOG(INFO) << "[HWAscendNPU]: Query model info success, model_size: "
<< model_size_ << ", model weights_size_: " << model_weights_size_;
ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY); ret = aclrtMalloc(&model_ptr_, model_size_, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) { if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, " LOG(ERROR) << "[HWAscendNPU]: Can not allocate a device memory for model, "
......
add_subdirectory(bridges) add_subdirectory(bridges)
add_kernel(subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS add_kernel(subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS
${lite_kernel_deps} build_hw_ascend_npu
device_hw_ascend_npu device_hw_ascend_npu
subgraph_bridge_engine subgraph_bridge_engine
runtime_hw_ascend_npu
${hw_ascend_npu_subgraph_bridges} ${hw_ascend_npu_subgraph_bridges}
subgraph_bridge_registry subgraph_bridge_registry
${lite_kernel_deps}
) )
...@@ -4,18 +4,24 @@ endif() ...@@ -4,18 +4,24 @@ endif()
lite_cc_library(subgraph_bridge_utility_hw_ascend_npu lite_cc_library(subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc SRCS utility.cc
DEPS ${hw_ascend_npu_builder_libs} tensor) DEPS hw_ascend_npu_libs tensor)
lite_cc_library(subgraph_bridge_graph_hw_ascend_npu lite_cc_library(subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc SRCS graph.cc
DEPS ${hw_ascend_npu_builder_libs} subgraph_bridge_utility_hw_ascend_npu DEPS hw_ascend_npu_libs subgraph_bridge_utility_hw_ascend_npu)
)
set(hw_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu)
set(hw_ascend_npu_subgraph_bridge_deps
subgraph_bridge_registry
subgraph_bridge_utility_hw_ascend_npu
subgraph_bridge_graph_hw_ascend_npu)
lite_cc_library(subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS lite_cc_library(subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS
${hw_ascend_npu_subgraph_bridge_deps} hw_ascend_npu_libs
${hw_ascend_npu_builder_libs}) ${hw_ascend_npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_concat_op_hw_ascend_npu SRCS concat_op.cc DEPS
# ${hw_ascend_npu_subgraph_bridge_deps}
# hw_ascend_npu_builder_libs)
set(hw_ascend_npu_subgraph_bridges set(hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu
......
...@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { ...@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto op_info = op->op_info(); auto op_info = op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto scope = op->scope(); auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes // Get input and output vars and op attributes
auto x_name = op_info->Input("X").front(); auto x_name = op_info->Input("X").front();
...@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx, ...@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx,
auto op_info = op->op_info(); auto op_info = op->op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
auto scope = op->scope(); auto scope = op->scope();
VLOG(3) << "[HWAscendNPU] Converting " + op_type + "..."; LOG(INFO) << "[HWAscendNPU] Converting " + op_type + "...";
// Get input and output vars and op attributes // Get input and output vars and op attributes
auto x_name = op_info->Input("X").front(); auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name); auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims(); auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front(); auto out_name = op_info->Output("Out").front();
LOG(INFO) << "[HWAscendNPU] xname: " << x_name << ", dims: " << x_dims;
// X node // X node
std::shared_ptr<Node> x_node = nullptr; std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) { if (graph->Has(x_name)) {
LOG(INFO) << "[HWAscendNPU] graph has node: " << x_name;
x_node = graph->Get(x_name); x_node = graph->Get(x_name);
} else { } else {
LOG(INFO) << "[HWAscendNPU] graph does no have node: " << x_name;
x_node = graph->Add(x_name, *x); x_node = graph->Add(x_name, *x);
} }
LOG(INFO) << "[HWAscendNPU] out name: " << out_name;
#if 0
// Act node // Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name); auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>(); auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data()); act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu, // TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc. // clipped_relu etc.
LOG(INFO) << "[HWAscendNPU] activation mode: " << op_type
<< ", type: " << CvtActMode(op_type);
act_op->set_attr_mode(CvtActMode(op_type)); act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") { if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef"); auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
...@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx, ...@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx,
float Relu_clipped_coef = 6.f; float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef); act_op->set_attr_coef(Relu_clipped_coef);
} }
#else
// Act node
auto act_node = graph->template Add<ge::op::Relu>(out_name);
auto act_op = act_node->template data<ge::op::Relu>();
act_op->set_input_x(*x_node->data());
#endif
return SUCCESS; return SUCCESS;
} }
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <all_ops.h>
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
// Bridge converter: translate a Paddle `concat` op into a ge::op::Concat
// node in the HW Ascend NPU IR graph.
//   ctx    - opaque pointer to the subgraph Graph being built (cast below)
//   op     - the Paddle op being converted; supplies inputs/outputs/attrs
//   kernel - unused by this converter (part of the common bridge signature)
// Returns SUCCESS on completion.
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HWAscendNPU] Converting " << op_type << " ... ";

  // Get input and output vars and op attributes.
  auto x_names = op_info->Input("X");
  auto out_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto num = x_names.size();

  // Create the Concat node up front, then wire each input tensor into one of
  // its dynamic input slots.
  // NOTE(review): set_input_concat_dim() is being passed a plain int; in the
  // GE IR, Concat's concat_dim is usually a tensor input — confirm this
  // overload exists and does what "axis attribute" intends.
  auto concat_node = graph->Add<ge::op::Concat>(out_name);
  auto concat_op = concat_node->data<ge::op::Concat>();
  concat_op->set_input_concat_dim(axis);
  concat_op->set_attr_N(num);
  concat_op->create_dynamic_input_input_values(num);
  // NOTE(review): dynamic input indices start at 1 here — assumed to match
  // the 1-based convention of create_dynamic_input_*; verify against the GE
  // operator headers.
  int idx = 1;
  for (auto& x_name : x_names) {
    auto x = scope->FindMutableTensor(x_name);
    auto x_dims = x->dims();
    // Reuse an existing graph node for this tensor, or register it now.
    std::shared_ptr<Node> x_node = nullptr;
    if (graph->Has(x_name)) {
      x_node = graph->Get(x_name);
    } else {
      x_node = graph->Add(x_name, *x);
    }
    concat_op->set_dynamic_input_input_values(idx, *x_node->data());
    idx++;
  }
  return SUCCESS;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
concat,
kHWAscendNPU,
paddle::lite::subgraph::hw_ascend_npu::ConcatConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HwAscendNPU] Converting " << op_type << "... ";
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
auto output_name = op_info->Output("Output").front();
auto output = scope->FindMutableTensor(output_name);
auto output_dims = output->dims();
auto bs = input_dims[0];
auto ic = input_dims[1];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4L);
CHECK_EQ(output_dims.size(), 4L);
CHECK_EQ(filter_dims.size(), 4L);
CHECK_EQ(output_dims[0], bs);
CHECK_EQ(output_dims[1], oc);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
// Input node
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
input_node = graph->Add(input_name, *input);
}
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
CHECK_EQ(paddings.size(), 4L) << "[HwAscendNPU] Paddings size should be the "
"same or twice as the input size.";
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
// Check depthwise mode, and decide whether use ConvolutionDepthwise Op
bool use_depthwise_conv =
false; // Whether use ge::op::ConvolutionDepthwise ?
bool is_depthwise_mode = ic == groups && oc == groups;
if (is_depthwise_mode &&
!((groups == 1 || groups >= 5) && dilations[0] == 1 &&
dilations[1] == 1)) {
use_depthwise_conv = true;
LOG(WARNING)
<< "[HwAscendNPU] For depthwise mode, dilation = 1 and groups >= 5 "
"(or groups = 1) is only supported in Convolution Op, so "
"force to use ConvolutionDepthwise Op, but may lead poor "
"performance.";
}
// Filter node
auto filter_node = graph->Add(filter_name, *filter);
// Add bias node if exists bias
// Supports the bias nodes with the following dimensions
// 0: {oc}
// 1: {1, oc, oh, ow}
// 2: {n, oc, oh, ow}
std::shared_ptr<Node> bias_node = nullptr;
bool is_channel_bias = false;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
std::vector<int64_t> bias_shape;
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {1, oc, 1, 1};
is_channel_bias = true;
} else if (bias_data_size == output_data_size / bs) {
// 1: {1, oc, oh, ow}
bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
} else if (bias_data_size == output_data_size) {
// 2: {n, oc, oh, ow}
bias_shape = output_dims.Vectorize();
} else {
LOG(WARNING)
<< "[HwAscendNPU] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
return FAILED;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
// Conv node
std::shared_ptr<Node> conv_node = nullptr;
if (use_depthwise_conv && is_depthwise_mode) {
conv_node = graph->Add<ge::op::DepthwiseConv2D>(output_name);
auto conv_op = conv_node->data<ge::op::ConvolutionDepthwise>();
conv_op->set_input_x(*input_node->data());
conv_op->set_input_filter(*filter_node->data());
conv_op->set_attr_mode(1);
conv_op->set_attr_algo(0);
conv_op->set_attr_format(0); // NCHW
conv_op->set_attr_pad_mode(5); // VALID
conv_op->set_attr_group(groups);
conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
{paddings[0], paddings[1], paddings[2], paddings[3]}));
conv_op->set_attr_dilation(
ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
conv_op->set_attr_kernel(
ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
// ConvolutionDepthwise Op doesn't support bias, so append Add node to
// support bias
if (bias_node != nullptr) {
auto add_node = graph->Add<ge::op::Add>(output_name);
auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*conv_node->data());
add_op->set_input_x2(*bias_node->data());
conv_node = add_node;
}
} else {
conv_node = graph->Add<ge::op::Convolution>(output_name);
auto conv_op = conv_node->data<ge::op::Convolution>();
conv_op->set_input_x(*input_node->data());
conv_op->set_input_w(*filter_node->data());
conv_op->set_attr_mode(1);
// when padding_algorithm=="SAME", NPU is different from lite
if (padding_algorithm == "VALID") {
conv_op->set_attr_pad_mode(5);
} else {
conv_op->set_attr_pad_mode(0);
}
conv_op->set_attr_group(groups);
conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
{paddings[0], paddings[1], paddings[2], paddings[3]}));
conv_op->set_attr_dilation(
ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
conv_op->set_attr_stride(ge::AttrValue::LIST_INT({strides[0], strides[1]}));
conv_op->set_attr_kernel(
ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
// Convolution Op only support bias with dimension {1, oc, 1, 1},
// so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
if (bias_node != nullptr) {
if (is_channel_bias) {
conv_op->set_input_b(*bias_node->data());
} else {
auto add_node = graph->Add<ge::op::Add>(output_name);
auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*conv_node->data());
add_op->set_input_x2(*bias_node->data());
conv_node = add_node;
}
}
}
CHECK(conv_node);
if (!act_type.empty()) {
auto act_node = graph->Add<ge::op::Activation>(output_name);
auto act_op = act_node->data<ge::op::Activation>();
act_op->set_input_x(*conv_node->data());
act_op->set_attr_mode(CvtActMode(act_type));
if (act_type == "leaky_relu") {
act_op->set_attr_negative_slope(leaky_relu_alpha);
} else if (act_type == "relu6") {
act_op->set_attr_coef(6.f);
}
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace hw_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Register the converter for both conv2d and depthwise_conv2d on the
// HW Ascend NPU target. The converter defined in this file lives in
// namespace paddle::lite::subgraph::hw_ascend_npu (see the namespace
// closers above), so it must be registered with that qualifier; the
// previous "subgraph::npu::ConvConverter" spelling named the HiAI NPU
// bridge's namespace and does not refer to this file's converter.
REGISTER_SUBGRAPH_BRIDGE(conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d,
                         kHWAscendNPU,
                         paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
...@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) { ...@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
if (it != nodes_.end()) { if (it != nodes_.end()) {
// Only variable node can be shared with the same name // Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) { if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[NPU] Const or data node " << name << " is redefined."; LOG(FATAL) << "[HWAscendNPU] Const or data node " << name
<< " is redefined.";
return -1; return -1;
} }
} else { } else {
...@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name, ...@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType precision, PrecisionType precision,
DataLayoutType layout) { DataLayoutType layout) {
auto node = Add<ge::op::Data>(name, precision, layout); auto node = Add<ge::op::Data>(name, precision, layout);
std::stringstream iss;
iss << "[HWAscendNPU] Add data node, shape: ";
for (auto& s : shape) {
iss << s << ",";
}
iss << " name: " << name;
LOG(INFO) << iss.str();
ge::TensorDesc desc( ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision)); ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
node->data<ge::op::Data>()->update_input_desc_data(desc); node->data<ge::op::Data>()->update_input_desc_data(desc);
......
...@@ -181,13 +181,14 @@ class Graph { ...@@ -181,13 +181,14 @@ class Graph {
} }
std::shared_ptr<Node> Get(std::string name) { std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[NPU] Node " << name << " not found."; CHECK(Has(name)) << "[HWAscendNPU] Node " << name << " not found.";
return nodes_.at(name).back(); return nodes_.at(name).back();
} }
bool Has(const std::string& name) { bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end(); return nodes_.find(name) != nodes_.end();
} }
size_t size() const { return nodes_.size(); }
private: private:
std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_; std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
......
...@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor, ...@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {}, std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW)); DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(std::string act_type); int CvtActMode(const std::string& act_type);
} // namespace hw_ascend_npu } // namespace hw_ascend_npu
} // namespace subgraph } // namespace subgraph
} // namespace lite } // namespace lite
......
...@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() {
// the HWAscendNPU IR graph // the HWAscendNPU IR graph
subgraph::hw_ascend_npu::Graph graph; subgraph::hw_ascend_npu::Graph graph;
const auto& bridges = subgraph::Registry::Instance(); const auto& bridges = subgraph::Registry::Instance();
LOG(INFO) << "[HWAscendNPU] Build device program";
for (auto& inst : origin_program_) { for (auto& inst : origin_program_) {
auto op = const_cast<OpLite*>(inst.op()); auto op = const_cast<OpLite*>(inst.op());
CHECK(op); CHECK(op);
op->CheckShape(); op->CheckShape();
op->InferShape(); op->InferShape();
std::string op_type = op->op_info()->Type(); std::string op_type = op->op_info()->Type();
LOG(INFO) << "[HWAscendNPU] trying to convert OP: " << op_type;
if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) { if (!bridges.Exists(op_type, TARGET(kHWAscendNPU))) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type
<< " does not exist for target HWAscendNPU";
return subgraph::FAILED; return subgraph::FAILED;
} }
LOG(INFO) << "[HWAscendNPU] OP: " << op_type << " exists for HWAscendNPU";
auto kernel = inst.kernel(); auto kernel = inst.kernel();
status |= bridges.Select(op_type, TARGET(kHWAscendNPU))( status |= bridges.Select(op_type, TARGET(kHWAscendNPU))(
reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel)); reinterpret_cast<void*>(&graph), op, const_cast<KernelBase*>(kernel));
if (subgraph::CHECK_FAILED(status)) { if (subgraph::CHECK_FAILED(status)) {
LOG(ERROR) << "[HWAscendNPU] OP: " << op_type << " select kernel failed";
return subgraph::FAILED; return subgraph::FAILED;
} }
LOG(INFO) << "[HWAscendNPU] OP: " << op_type
<< " select kernel for HWAscendNPU";
} }
LOG(INFO) << "[HWAscendNPU] Graph size: " << graph.size();
// Collect the valid input and output nodes in the HiAI IR graph and update // Collect the valid input and output nodes in the HiAI IR graph and update
// the input and output names // the input and output names
device_inames_.clear(); device_inames_.clear();
...@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() {
std::vector<ge::Operator> device_inodes; std::vector<ge::Operator> device_inodes;
std::vector<ge::Operator> device_onodes; std::vector<ge::Operator> device_onodes;
for (auto& input_name : input_names_) { for (auto& input_name : input_names_) {
LOG(INFO) << "[HWAscendNPU] input name: " << input_name;
if (graph.Has(input_name)) { if (graph.Has(input_name)) {
LOG(INFO) << "[HWAscendNPU] Graph has input name: " << input_name;
if (graph.Get(input_name)->is_data()) { if (graph.Get(input_name)->is_data()) {
LOG(INFO) << "[HWAscendNPU] the current input name: " << input_name
<< " is data";
device_inodes.push_back(*graph.Get(input_name)->data()); device_inodes.push_back(*graph.Get(input_name)->data());
device_inames_.push_back(input_name); device_inames_.push_back(input_name);
} else { } else {
...@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() { ...@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() {
CHECK(!device_onames_.empty()) CHECK(!device_onames_.empty())
<< "[HWAscendNPU] No output nodes found for building NPU model"; << "[HWAscendNPU] No output nodes found for building NPU model";
LOG(INFO) << "[HWAscendNPU] Graph size to build: " << graph.size();
// Build the IR graph to om model as the device program // Build the IR graph to om model as the device program
if (device_program_map_.count(inputs_shape_) > 0) { if (device_program_map_.count(inputs_shape_) > 0) {
return status; return status;
} }
LOG(INFO) << "[HWAscendNPU] Start to build, device_inodes = "
<< device_inodes.size()
<< ", device_onodes = " << device_onodes.size();
auto device_client = auto device_client =
lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes); lite::hw_ascend_npu::Device::Global().Build(device_inodes, device_onodes);
if (device_client == nullptr) { if (device_client == nullptr) {
...@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
// tensors // tensors
auto device_program = device_program_map_[inputs_shape_]; auto device_program = device_program_map_[inputs_shape_];
int ret = 0; int ret = 0;
LOG(INFO) << "[HWAscendNPU] start to set input...";
ret = device_program->client->SetInput(origin_itensors_, ret = device_program->client->SetInput(origin_itensors_,
device_program->origin_idims); device_program->origin_idims);
if (ret != 0) { if (ret != 0) {
return ret; return ret;
} }
LOG(INFO) << "[HWAscendNPU] start to create output...";
device_program->client->CreateOutput(device_program->origin_odims); device_program->client->CreateOutput(device_program->origin_odims);
...@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() { ...@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
}; };
auto start_time = GetCurrentUS(); auto start_time = GetCurrentUS();
CHECK_EQ(device_program->client->Process(), 0); CHECK_EQ(device_program->client->Process(), 0);
VLOG(3) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time LOG(INFO) << "[HWAscendNPU] Process cost " << GetCurrentUS() - start_time
<< " us"; << " us";
device_program->client->GetOutput(&origin_otensors_); device_program->client->GetOutput(&origin_otensors_);
LOG(INFO) << "[HWAscendNPU] Get ouput done";
return 0; return 0;
} }
...@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() { ...@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() {
} }
void SubgraphCompute::Run() { void SubgraphCompute::Run() {
LOG(INFO) << "[HWAscendNPU] Start to run";
CHECK(engine_); CHECK(engine_);
LOG(INFO) << "[HWAscendNPU] Start to call Launch";
engine_->Launch(); engine_->Launch();
} }
......
...@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() { ...@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() {
} }
int Engine::Launch() { int Engine::Launch() {
LOG(INFO) << "[HWAscendNPU] in Launch, start to build if needed";
// Rebuild device program when the shapes of input tensors have been changed. // Rebuild device program when the shapes of input tensors have been changed.
if (CHECK_SUCCESS(build_device_program_status_) && if (CHECK_SUCCESS(build_device_program_status_) &&
CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) && CHECK_REBUILD_WHEN_SHAPE_CHANGED(build_device_program_status_) &&
InputShapeChanged()) { InputShapeChanged()) {
Build(); Build();
} }
LOG(INFO) << "[HWAscendNPU] launch program";
if (CHECK_FAILED(build_device_program_status_)) { if (CHECK_FAILED(build_device_program_status_)) {
LOG(INFO) << "[HWAscendNPU] launch original program";
LaunchOriginProgram(); LaunchOriginProgram();
} else { } else {
LOG(INFO) << "[HWAscendNPU] launch device program";
LaunchDeviceProgram(); LaunchDeviceProgram();
} }
return 0; return 0;
......
...@@ -9,7 +9,8 @@ set (kernels ...@@ -9,7 +9,8 @@ set (kernels
${host_kernels} ${host_kernels}
${hw_ascend_npu_kernels}) ${hw_ascend_npu_kernels})
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) message(STATUS "======---------------------------------=================${hw_ascend_npu_kernels}")
if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels}) lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels} ${hw_ascend_npu_kernels})
......
...@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) { ...@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) {
place = TARGET(kXPU); place = TARGET(kXPU);
#elif defined(LITE_WITH_HW_ASCEND_NPU) #elif defined(LITE_WITH_HW_ASCEND_NPU)
place = TARGET(kHWAscendNPU); place = TARGET(kHWAscendNPU);
std::cout << "-----------test relu with hw_ascend_npu" << std::endl;
#else #else
return; return;
#endif #endif
......
...@@ -3,7 +3,7 @@ set -ex ...@@ -3,7 +3,7 @@ set -ex
# global variables with default value # global variables with default value
ASCEND_HOME="/usr/local/Ascend" # Ascend SDK root directory ASCEND_HOME="/usr/local/Ascend" # Ascend SDK root directory
TARGET_NAME="test_subgraph_pass" # default target TARGET_NAME="test_kernel_activation_compute" # default target
BUILD_EXTRA=ON # ON(with sequence ops)/OFF BUILD_EXTRA=ON # ON(with sequence ops)/OFF
WITH_TESTING=ON # ON/OFF WITH_TESTING=ON # ON/OFF
...@@ -80,7 +80,7 @@ function build_hw_ascend_npu { ...@@ -80,7 +80,7 @@ function build_hw_ascend_npu {
-DWITH_TESTING=${WITH_TESTING} \ -DWITH_TESTING=${WITH_TESTING} \
-DASCEND_HOME=${HW_ASCEND_NPU_SDK_ROOT} -DASCEND_HOME=${HW_ASCEND_NPU_SDK_ROOT}
make -j$NUM_CORES_FOR_COMPILE make $TARGET_NAME -j2
cd - cd -
echo "Done" echo "Done"
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册