Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
5cab7cdd
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
338
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5cab7cdd
编写于
5月 29, 2020
作者:
Y
yanghongtian
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
collect all
上级
f6cf1f9f
变更
20
隐藏空白更改
内联
并排
Showing
20 changed file
with
603 addition
and
99 deletion
+603
-99
cmake/device/hw_ascend_npu.cmake
cmake/device/hw_ascend_npu.cmake
+85
-31
cmake/lite.cmake
cmake/lite.cmake
+16
-16
lite/api/CMakeLists.txt
lite/api/CMakeLists.txt
+1
-1
lite/backends/hw_ascend_npu/CMakeLists.txt
lite/backends/hw_ascend_npu/CMakeLists.txt
+18
-6
lite/backends/hw_ascend_npu/device.cc
lite/backends/hw_ascend_npu/device.cc
+75
-16
lite/backends/hw_ascend_npu/device.h
lite/backends/hw_ascend_npu/device.h
+15
-10
lite/backends/hw_ascend_npu/runtime.cc
lite/backends/hw_ascend_npu/runtime.cc
+5
-2
lite/kernels/hw_ascend_npu/CMakeLists.txt
lite/kernels/hw_ascend_npu/CMakeLists.txt
+3
-1
lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt
lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt
+12
-6
lite/kernels/hw_ascend_npu/bridges/act_op.cc
lite/kernels/hw_ascend_npu/bridges/act_op.cc
+15
-2
lite/kernels/hw_ascend_npu/bridges/concat_op.cc
lite/kernels/hw_ascend_npu/bridges/concat_op.cc
+70
-0
lite/kernels/hw_ascend_npu/bridges/conv_op.cc
lite/kernels/hw_ascend_npu/bridges/conv_op.cc
+242
-0
lite/kernels/hw_ascend_npu/bridges/graph.cc
lite/kernels/hw_ascend_npu/bridges/graph.cc
+9
-1
lite/kernels/hw_ascend_npu/bridges/graph.h
lite/kernels/hw_ascend_npu/bridges/graph.h
+2
-1
lite/kernels/hw_ascend_npu/bridges/utility.h
lite/kernels/hw_ascend_npu/bridges/utility.h
+1
-1
lite/kernels/hw_ascend_npu/subgraph_compute.cc
lite/kernels/hw_ascend_npu/subgraph_compute.cc
+25
-2
lite/kernels/npu/bridges/engine.cc
lite/kernels/npu/bridges/engine.cc
+4
-0
lite/tests/kernels/CMakeLists.txt
lite/tests/kernels/CMakeLists.txt
+2
-1
lite/tests/kernels/activation_compute_test.cc
lite/tests/kernels/activation_compute_test.cc
+1
-0
lite/tools/build_hw_ascend_npu.sh
lite/tools/build_hw_ascend_npu.sh
+2
-2
未找到文件。
cmake/device/hw_ascend_npu.cmake
浏览文件 @
5cab7cdd
...
@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME)
...
@@ -24,33 +24,6 @@ if(NOT DEFINED ASCEND_HOME)
endif
()
endif
()
message
(
STATUS
"LITE_WITH_HW_ASCEND_NPU:
${
LITE_WITH_HW_ASCEND_NPU
}
"
)
message
(
STATUS
"LITE_WITH_HW_ASCEND_NPU:
${
LITE_WITH_HW_ASCEND_NPU
}
"
)
find_path
(
ACL_INC NAMES acl/acl.h
PATHS
${
ASCEND_HOME
}
/acllib/include NO_DEFAULT_PATH
)
if
(
NOT ACL_INC
)
message
(
FATAL_ERROR
"Can not find acl/acl.h in
${
ASCEND_HOME
}
/include"
)
endif
()
include_directories
(
"
${
ACL_INC
}
"
)
set
(
ACL_LIB_FILES
acl_dvpp
ascendcl
register
runtime
)
foreach
(
libname
${
ACL_LIB_FILES
}
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/acllib/lib64
)
if
(
lib_name_path_
${
libname
}
)
add_library
(
acl_
${
libname
}
SHARED IMPORTED GLOBAL
)
set_property
(
TARGET acl_
${
libname
}
PROPERTY IMPORTED_LOCATION
${
lib_name_path_
${
libname
}}
)
list
(
APPEND acl_libs acl_
${
libname
}
)
else
()
message
(
FATAL_ERROR
"can not find library:
${
libname
}
"
)
endif
()
endforeach
()
set
(
hw_ascend_npu_runtime_libs
${
acl_libs
}
CACHE INTERNAL
"ascend runtime libs"
)
# find atc include folder and library
# find atc include folder and library
find_path
(
ATC_INC NAMES ge/ge_ir_build.h
find_path
(
ATC_INC NAMES ge/ge_ir_build.h
...
@@ -61,6 +34,8 @@ endif()
...
@@ -61,6 +34,8 @@ endif()
include_directories
(
"
${
ATC_INC
}
"
)
include_directories
(
"
${
ATC_INC
}
"
)
set
(
ATC_LIB_FILES
set
(
ATC_LIB_FILES
ge_compiler
graph
_caffe_parser
_caffe_parser
auto_tiling
auto_tiling
c_sec
c_sec
...
@@ -76,9 +51,7 @@ set(ATC_LIB_FILES
...
@@ -76,9 +51,7 @@ set(ATC_LIB_FILES
fmk_tensorflow_parser
fmk_tensorflow_parser
ge_client
ge_client
ge_common
ge_common
ge_compiler
ge_executor
ge_executor
graph
mmpa
mmpa
msprof
msprof
parser_common
parser_common
...
@@ -92,6 +65,16 @@ set(ATC_LIB_FILES
...
@@ -92,6 +65,16 @@ set(ATC_LIB_FILES
tvm_runtime
tvm_runtime
tvm_topi
tvm_topi
)
)
set
(
ATC_PLUGIN_NNENGIN_LIB_FILES
engine
)
set
(
ATC_PLUGIN_OPSKERNEL_LIB_FILES
aicpu_engine
fe
ge_local_engine
rts_engine
)
foreach
(
libname
${
ATC_LIB_FILES
}
)
foreach
(
libname
${
ATC_LIB_FILES
}
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/atc/lib64
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/atc/lib64
)
...
@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES})
...
@@ -104,6 +87,28 @@ foreach (libname ${ATC_LIB_FILES})
endif
()
endif
()
endforeach
()
endforeach
()
foreach
(
libname
${
ATC_PLUGIN_NNENGIN_LIB_FILES
}
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/atc/lib64/plugin/nnengine
)
if
(
lib_name_path_
${
libname
}
)
add_library
(
atc_
${
libname
}
SHARED IMPORTED GLOBAL
)
set_property
(
TARGET atc_
${
libname
}
PROPERTY IMPORTED_LOCATION
${
lib_name_path_
${
libname
}}
)
list
(
APPEND atc_libs atc_
${
libname
}
)
else
()
message
(
FATAL_ERROR
"can not find library:
${
libname
}
"
)
endif
()
endforeach
()
foreach
(
libname
${
ATC_PLUGIN_OPSKERNEL_LIB_FILES
}
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/atc/lib64/plugin/opskernel
)
if
(
lib_name_path_
${
libname
}
)
add_library
(
atc_
${
libname
}
SHARED IMPORTED GLOBAL
)
set_property
(
TARGET atc_
${
libname
}
PROPERTY IMPORTED_LOCATION
${
lib_name_path_
${
libname
}}
)
list
(
APPEND atc_libs atc_
${
libname
}
)
else
()
message
(
FATAL_ERROR
"can not find library:
${
libname
}
"
)
endif
()
endforeach
()
# find opp include folder and library
# find opp include folder and library
find_path
(
OPP_INC NAMES all_ops.h
find_path
(
OPP_INC NAMES all_ops.h
PATHS
${
ASCEND_HOME
}
/opp/op_proto/built-in/inc
)
PATHS
${
ASCEND_HOME
}
/opp/op_proto/built-in/inc
)
...
@@ -139,10 +144,59 @@ else()
...
@@ -139,10 +144,59 @@ else()
set_property
(
TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION
${
OPP_FUSION_VECTORCORE
}
)
set_property
(
TARGET opp_fusion_pass_vectorcore_lib PROPERTY IMPORTED_LOCATION
${
OPP_FUSION_VECTORCORE
}
)
endif
()
endif
()
set
(
hw_ascend_npu_builder_libs
add_library
(
hw_ascend_npu_builder_libs INTERFACE
)
target_link_libraries
(
hw_ascend_npu_builder_libs INTERFACE
${
atc_libs
}
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib
)
#set(hw_ascend_npu_builder_libs
# ${atc_libs}
# opp_opsproto_lib
# opp_fusion_pass_aicore_lib
# opp_fusion_pass_vectorcore_lib
# CACHE INTERNAL "ascend builder libs")
# find ascend cl runtime library
find_path
(
ACL_INC NAMES acl/acl.h
PATHS
${
ASCEND_HOME
}
/acllib/include NO_DEFAULT_PATH
)
if
(
NOT ACL_INC
)
message
(
FATAL_ERROR
"Can not find acl/acl.h in
${
ASCEND_HOME
}
/include"
)
endif
()
include_directories
(
"
${
ACL_INC
}
"
)
set
(
ACL_LIB_FILES
acl_dvpp
ascendcl
register
runtime
)
foreach
(
libname
${
ACL_LIB_FILES
}
)
find_library
(
lib_name_path_
${
libname
}
NAMES
${
libname
}
PATHS
${
ASCEND_HOME
}
/acllib/lib64
)
if
(
lib_name_path_
${
libname
}
)
add_library
(
acl_
${
libname
}
SHARED IMPORTED GLOBAL
)
set_property
(
TARGET acl_
${
libname
}
PROPERTY IMPORTED_LOCATION
${
lib_name_path_
${
libname
}}
)
list
(
APPEND acl_libs acl_
${
libname
}
)
else
()
message
(
FATAL_ERROR
"can not find library:
${
libname
}
"
)
endif
()
endforeach
()
add_library
(
hw_ascend_npu_runtime_libs INTERFACE
)
target_link_libraries
(
hw_ascend_npu_runtime_libs INTERFACE
${
acl_libs
}
)
add_library
(
hw_ascend_npu_libs INTERFACE
)
target_link_libraries
(
hw_ascend_npu_libs INTERFACE
${
atc_libs
}
${
atc_libs
}
opp_opsproto_lib
opp_opsproto_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_aicore_lib
opp_fusion_pass_vectorcore_lib
opp_fusion_pass_vectorcore_lib
CACHE INTERNAL
"ascend builder libs"
)
${
acl_libs
}
)
# set(hw_ascend_npu_runtime_libs ${acl_libs} CACHE INTERNAL "ascend runtime libs")
cmake/lite.cmake
浏览文件 @
5cab7cdd
...
@@ -23,7 +23,7 @@ function (lite_deps TARGET)
...
@@ -23,7 +23,7 @@ function (lite_deps TARGET)
set
(
options
""
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS
set
(
multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS ARGS
)
CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS
MLU_DEPS
HW_ASCEND_NPU_DEPS CV_DEPS ARGS
)
cmake_parse_arguments
(
lite_deps
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
lite_deps
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
deps
${
lite_deps_DEPS
}
)
set
(
deps
${
lite_deps_DEPS
}
)
...
@@ -138,7 +138,7 @@ function(lite_cc_library TARGET)
...
@@ -138,7 +138,7 @@ function(lite_cc_library TARGET)
set
(
options SHARED shared STATIC static MODULE module
)
set
(
options SHARED shared STATIC static MODULE module
)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS
)
XPU_DEPS
MLU_DEPS
HW_ASCEND_NPU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
deps
""
)
set
(
deps
""
)
...
@@ -156,7 +156,7 @@ function(lite_cc_library TARGET)
...
@@ -156,7 +156,7 @@ function(lite_cc_library TARGET)
PROFILE_DEPS
${
args_PROFILE_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
#
MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS
${
args_MLU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
)
)
...
@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET)
...
@@ -185,7 +185,7 @@ function(lite_cc_binary TARGET)
endif
()
endif
()
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS
PROFILE_DEPS HW_ASCEND_NPU
_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS
)
XPU_DEPS
MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE
_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
set
(
deps
""
)
set
(
deps
""
)
...
@@ -199,12 +199,12 @@ function(lite_cc_binary TARGET)
...
@@ -199,12 +199,12 @@ function(lite_cc_binary TARGET)
NPU_DEPS
${
args_NPU_DEPS
}
NPU_DEPS
${
args_NPU_DEPS
}
XPU_DEPS
${
args_XPU_DEPS
}
XPU_DEPS
${
args_XPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
CV_DEPS
${
CV_DEPS
}
CV_DEPS
${
CV_DEPS
}
#
MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS
${
args_MLU_DEPS
}
)
)
cc_binary
(
${
TARGET
}
SRCS
${
args_SRCS
}
DEPS
${
deps
}
)
cc_binary
(
${
TARGET
}
SRCS
${
args_SRCS
}
DEPS
${
deps
}
)
target_compile_options
(
${
TARGET
}
BEFORE PRIVATE -Wno-ignored-qualifiers
)
target_compile_options
(
${
TARGET
}
BEFORE PRIVATE -Wno-ignored-qualifiers
)
...
@@ -235,7 +235,7 @@ function(lite_cc_test TARGET)
...
@@ -235,7 +235,7 @@ function(lite_cc_test TARGET)
set
(
options
""
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS
PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS HW_ASCEND_NPU
_DEPS
XPU_DEPS
HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV
_DEPS
ARGS COMPILE_LEVEL
# (basic|extra)
ARGS COMPILE_LEVEL
# (basic|extra)
)
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
...
@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
...
@@ -261,7 +261,7 @@ function(lite_cc_test TARGET)
LIGHT_DEPS
${
args_LIGHT_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
CV_DEPS
${
args_CV_DEPS
}
CV_DEPS
${
args_CV_DEPS
}
#
MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS
${
args_MLU_DEPS
}
)
)
_lite_cc_test
(
${
TARGET
}
SRCS
${
args_SRCS
}
DEPS
${
deps
}
ARGS
${
args_ARGS
}
)
_lite_cc_test
(
${
TARGET
}
SRCS
${
args_SRCS
}
DEPS
${
deps
}
ARGS
${
args_ARGS
}
)
# strip binary target to reduce size
# strip binary target to reduce size
...
@@ -309,9 +309,9 @@ endif()
...
@@ -309,9 +309,9 @@ endif()
function
(
add_kernel TARGET device level
)
function
(
add_kernel TARGET device level
)
set
(
options
""
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS HW_ASCEND_NPU
_DEPS
XPU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE
_DEPS
ARGS
)
ARGS
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
LITE_BUILD_TAILOR
)
if
(
LITE_BUILD_TAILOR
)
...
@@ -444,7 +444,7 @@ function(add_kernel TARGET device level)
...
@@ -444,7 +444,7 @@ function(add_kernel TARGET device level)
XPU_DEPS
${
args_XPU_DEPS
}
XPU_DEPS
${
args_XPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
#
MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS
${
args_MLU_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
...
@@ -463,9 +463,9 @@ endif()
...
@@ -463,9 +463,9 @@ endif()
function
(
add_operator TARGET level
)
function
(
add_operator TARGET level
)
set
(
options
""
)
set
(
options
""
)
set
(
oneValueArgs
""
)
set
(
oneValueArgs
""
)
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
XPU_DEPS PROFILE_DEPS
set
(
multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
XPU_DEPS MLU_DEPS HW_ASCEND_NPU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS
)
ARGS
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
cmake_parse_arguments
(
args
"
${
options
}
"
"
${
oneValueArgs
}
"
"
${
multiValueArgs
}
"
${
ARGN
}
)
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
if
(
"
${
level
}
"
STREQUAL
"extra"
AND
(
NOT LITE_BUILD_EXTRA
))
...
@@ -499,7 +499,7 @@ function(add_operator TARGET level)
...
@@ -499,7 +499,7 @@ function(add_operator TARGET level)
XPU_DEPS
${
args_XPU_DEPS
}
XPU_DEPS
${
args_XPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
HW_ASCEND_NPU_DEPS
${
args_HW_ASCEND_NPU_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
BM_DEPS
${
args_BM_DEPS
}
#
MLU_DEPS ${args_MLU_DEPS}
MLU_DEPS
${
args_MLU_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
PROFILE_DEPS
${
args_PROFILE_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
LIGHT_DEPS
${
args_LIGHT_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
HVY_DEPS
${
args_HVY_DEPS
}
...
...
lite/api/CMakeLists.txt
浏览文件 @
5cab7cdd
...
@@ -69,7 +69,7 @@ if (WITH_TESTING)
...
@@ -69,7 +69,7 @@ if (WITH_TESTING)
XPU_DEPS
${
xpu_kernels
}
XPU_DEPS
${
xpu_kernels
}
BM_DEPS
${
bm_kernels
}
BM_DEPS
${
bm_kernels
}
HW_ASCEND_NPU_DEPS
${
hw_ascend_npu_kernels
}
HW_ASCEND_NPU_DEPS
${
hw_ascend_npu_kernels
}
#
MLU_DEPS ${mlu_kernels}
MLU_DEPS
${
mlu_kernels
}
)
)
endif
()
endif
()
if
(
LITE_WITH_FPGA
)
if
(
LITE_WITH_FPGA
)
...
...
lite/backends/hw_ascend_npu/CMakeLists.txt
浏览文件 @
5cab7cdd
...
@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU)
...
@@ -2,12 +2,24 @@ if(NOT LITE_WITH_HW_ASCEND_NPU)
return
()
return
()
endif
()
endif
()
lite_cc_library
(
build_hw_ascend_npu SRCS build.cc DEPS
hw_ascend_npu_libs
)
lite_cc_library
(
device_hw_ascend_npu SRCS device.cc DEPS
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library
(
target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
lite_cc_library
(
target_wrapper_hw_ascend_npu SRCS target_wrapper.cc DEPS
${
hw_ascend_npu_runtime_libs
}
)
hw_ascend_npu_libs
build_hw_ascend_npu
)
lite_cc_library
(
runtime_hw_ascend_npu SRCS runtime.cc DEPS
lite_cc_library
(
runtime_hw_ascend_npu SRCS runtime.cc DEPS
${
hw_ascend_npu_runtime_libs
}
hw_ascend_npu_libs
target_wrapper_hw_ascend_npu
)
lite_cc_library
(
device_hw_ascend_npu SRCS device.cc DEPS
${
hw_ascend_npu_runtime_libs
}
target_wrapper_hw_ascend_npu
target_wrapper_hw_ascend_npu
runtime_hw_ascend_npu
)
device_hw_ascend_npu
build_hw_ascend_npu
)
add_executable
(
test_build test_build.cc
)
target_link_libraries
(
test_build build_hw_ascend_npu
)
lite/backends/hw_ascend_npu/device.cc
浏览文件 @
5cab7cdd
...
@@ -16,7 +16,7 @@
...
@@ -16,7 +16,7 @@
#include <map>
#include <map>
#include <string>
#include <string>
#include "ge/ge_api_types.h"
#include "ge/ge_api_types.h"
#include "lite/backends/hw_ascend_npu/
runtime
.h"
#include "lite/backends/hw_ascend_npu/
build
.h"
#include "lite/utils/cp_logging.h"
#include "lite/utils/cp_logging.h"
namespace
paddle
{
namespace
paddle
{
...
@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build(
...
@@ -26,33 +26,92 @@ std::shared_ptr<HWAscendNPURuntime> Device::Build(
std
::
vector
<
ge
::
Operator
>&
input_nodes
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
input_nodes
,
// NOLINT
std
::
vector
<
ge
::
Operator
>&
output_nodes
// NOLINT
std
::
vector
<
ge
::
Operator
>&
output_nodes
// NOLINT
)
{
)
{
VLOG
(
3
)
<<
"[HWAscendNPU] Build model"
;
std
::
shared_ptr
<
ge
::
ModelBufferData
>
model_data
=
// Build the IR graph to the om model
paddle
::
lite
::
hw_ascend_npu
::
Build
(
input_nodes
,
output_nodes
);
ge
::
Graph
ir_graph
(
"graph"
);
if
(
model_data
==
nullptr
)
{
ir_graph
.
SetInputs
(
input_nodes
).
SetOutputs
(
output_nodes
);
LOG
(
ERROR
)
<<
"[HWAscendNPU] Build model failed"
;
ge
::
ModelBufferData
model
;
std
::
map
<
std
::
string
,
std
::
string
>
build_options
;
build_options
.
insert
({
ge
::
ir_option
::
EXEC_DISABLE_REUSED_MEMORY
,
"1"
});
ge
::
graphStatus
ret
=
aclgrphBuildModel
(
ir_graph
,
build_options
,
model
);
if
(
ret
!=
ge
::
GRAPH_SUCCESS
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] Build model failed, error code: "
<<
ret
;
return
nullptr
;
return
nullptr
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] Build model success"
;
if
(
!
inited_
)
{
if
(
0
==
InitDevice
())
{
LOG
(
INFO
)
<<
"Init success."
;
inited_
=
true
;
}
}
std
::
shared_ptr
<
HWAscendNPURuntime
>
model_runtime
(
std
::
shared_ptr
<
HWAscendNPURuntime
>
model_runtime
(
new
HWAscendNPURuntime
(
model
.
data
,
model
.
length
));
new
HWAscendNPURuntime
(
model
_data
->
data
,
model_data
->
length
));
CHECK
(
model_runtime
!=
nullptr
);
CHECK
(
model_runtime
!=
nullptr
);
if
(
!
model_runtime
->
model_loaded
())
{
if
(
!
model_runtime
->
model_loaded
())
{
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can not create model runtime instance"
;
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can not create model runtime instance"
;
return
nullptr
;
return
nullptr
;
}
}
VLOG
(
3
)
<<
"[HWAscendNPU]: Build done"
;
LOG
(
INFO
)
<<
"[HWAscendNPU]: Build done"
;
return
model_runtime
;
return
model_runtime
;
}
}
int
Device
::
InitDevice
()
{
const
char
*
acl_conf
=
"/usr/local/acl.json"
;
aclError
ret
=
aclInit
(
acl_conf
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] acl init failed"
;
return
-
1
;
}
// open device
ret
=
aclrtSetDevice
(
device_id_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] acl open device "
<<
device_id_
<<
" failed"
;
return
-
1
;
}
ret
=
aclrtCreateContext
(
&
context_ptr_
,
device_id_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"acl create context failed"
;
return
-
1
;
}
// create stream
ret
=
aclrtCreateStream
(
&
stream_ptr_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] acl create stream failed"
;
return
-
1
;
}
// get run mode
aclrtGetRunMode
runMode
;
ret
=
aclrtGetMode
(
&
runMode
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] acl get run mode failed"
;
return
-
1
;
}
is_devcie_
=
(
runMode
==
ACL_DEVICE
);
LOG
(
INFO
)
<<
"[HWAscendNPU] Hardware initialization done"
;
return
0
;
}
void
Device
::
ReleaseDevice
()
{
aclError
ret
;
if
(
stream_ptr_
!=
nullptr
)
{
ret
=
aclrtDestroyStream
(
stream_ptr_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] destroy stream failed"
;
}
stream_ptr_
=
nullptr
;
}
LOG
(
INFO
)
<<
"[HWAscendNPU] end to destroy stream"
;
if
(
context_ptr_
!=
nullptr
)
{
ret
=
aclrtDestroyContext
(
context_ptr_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] destroy context failed"
;
}
context_ptr_
=
nullptr
;
}
LOG
(
INFO
)
<<
"[HWAscendNPU] Release device successfully"
;
}
}
// namespace hw_ascend_npu
}
// namespace hw_ascend_npu
}
// namespace lite
}
// namespace lite
}
// namespace paddle
}
// namespace paddle
lite/backends/hw_ascend_npu/device.h
浏览文件 @
5cab7cdd
...
@@ -18,8 +18,9 @@
...
@@ -18,8 +18,9 @@
#include <string>
#include <string>
#include <unordered_map>
#include <unordered_map>
#include <vector>
#include <vector>
#include "ge/ge_ir_build.h"
// NOLINT
#include "ge/ge_ir_build.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/backends/hw_ascend_npu/runtime.h"
#include "lite/utils/cp_logging.h"
namespace
paddle
{
namespace
paddle
{
namespace
lite
{
namespace
lite
{
namespace
hw_ascend_npu
{
namespace
hw_ascend_npu
{
...
@@ -30,12 +31,11 @@ class Device {
...
@@ -30,12 +31,11 @@ class Device {
static
Device
x
;
static
Device
x
;
return
x
;
return
x
;
}
}
Device
()
{}
Device
()
:
inited_
(
false
)
{}
int
freq_level
()
{
return
freq_level_
;
}
~
Device
()
{
ReleaseDevice
();
}
int
framework_type
()
{
return
framework_type_
;
}
int
model_type
()
{
return
model_type_
;
}
bool
is_device
()
const
{
return
is_devcie_
;
}
int
device_type
()
{
return
device_type_
;
}
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// Build the IR graph to om model, return a HWAscendNPURuntime instance to
// load om model and run inference.
// load om model and run inference.
...
@@ -45,10 +45,15 @@ class Device {
...
@@ -45,10 +45,15 @@ class Device {
);
// NOLINT
);
// NOLINT
private:
private:
int
freq_level_
{
3
};
int
InitDevice
();
int
framework_type_
{
0
};
void
ReleaseDevice
();
int
model_type_
{
0
};
int
device_type_
{
0
};
private:
bool
inited_
{
false
};
int
device_id_
{
0
};
bool
is_devcie_
{
false
};
aclrtContext
context_ptr_
{
nullptr
};
aclrtStream
stream_ptr_
{
nullptr
};
};
};
}
// namespace hw_ascend_npu
}
// namespace hw_ascend_npu
...
...
lite/backends/hw_ascend_npu/runtime.cc
浏览文件 @
5cab7cdd
...
@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem(
...
@@ -42,12 +42,15 @@ int HWAscendNPURuntime::LoadModelFromMem(
&
model_size_
,
&
model_size_
,
&
model_weights_size_
);
&
model_weights_size_
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can query size from a built model buffer, "
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can
't
query size from a built model buffer, "
"error code: "
"error code: "
<<
ret
;
<<
ret
<<
", model buffer size: "
<<
model_buff_size
;
return
ret
;
return
ret
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU]: Query model info success, model_size: "
<<
model_size_
<<
", model weights_size_: "
<<
model_weights_size_
;
ret
=
aclrtMalloc
(
&
model_ptr_
,
model_size_
,
ACL_MEM_MALLOC_NORMAL_ONLY
);
ret
=
aclrtMalloc
(
&
model_ptr_
,
model_size_
,
ACL_MEM_MALLOC_NORMAL_ONLY
);
if
(
ret
!=
ACL_ERROR_NONE
)
{
if
(
ret
!=
ACL_ERROR_NONE
)
{
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can not allocate a device memory for model, "
LOG
(
ERROR
)
<<
"[HWAscendNPU]: Can not allocate a device memory for model, "
...
...
lite/kernels/hw_ascend_npu/CMakeLists.txt
浏览文件 @
5cab7cdd
add_subdirectory
(
bridges
)
add_subdirectory
(
bridges
)
add_kernel
(
subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS
add_kernel
(
subgraph_compute_hw_ascend_npu HW_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS
${
lite_kernel_deps
}
build_hw_ascend_npu
device_hw_ascend_npu
device_hw_ascend_npu
subgraph_bridge_engine
subgraph_bridge_engine
runtime_hw_ascend_npu
${
hw_ascend_npu_subgraph_bridges
}
${
hw_ascend_npu_subgraph_bridges
}
subgraph_bridge_registry
subgraph_bridge_registry
${
lite_kernel_deps
}
)
)
lite/kernels/hw_ascend_npu/bridges/CMakeLists.txt
浏览文件 @
5cab7cdd
...
@@ -4,18 +4,24 @@ endif()
...
@@ -4,18 +4,24 @@ endif()
lite_cc_library
(
subgraph_bridge_utility_hw_ascend_npu
lite_cc_library
(
subgraph_bridge_utility_hw_ascend_npu
SRCS utility.cc
SRCS utility.cc
DEPS
${
hw_ascend_npu_builder_libs
}
tensor
)
DEPS
hw_ascend_npu_libs
tensor
)
lite_cc_library
(
subgraph_bridge_graph_hw_ascend_npu
lite_cc_library
(
subgraph_bridge_graph_hw_ascend_npu
SRCS graph.cc
SRCS graph.cc
DEPS
${
hw_ascend_npu_builder_libs
}
subgraph_bridge_utility_hw_ascend_npu
DEPS hw_ascend_npu_libs subgraph_bridge_utility_hw_ascend_npu
)
)
set
(
hw_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_hw_ascend_npu subgraph_bridge_graph_hw_ascend_npu
)
set
(
hw_ascend_npu_subgraph_bridge_deps
subgraph_bridge_registry
subgraph_bridge_utility_hw_ascend_npu
subgraph_bridge_graph_hw_ascend_npu
)
lite_cc_library
(
subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS
lite_cc_library
(
subgraph_bridge_act_op_hw_ascend_npu SRCS act_op.cc DEPS
${
hw_ascend_npu_subgraph_bridge_deps
}
hw_ascend_npu_libs
${
hw_ascend_npu_builder_libs
}
)
${
hw_ascend_npu_subgraph_bridge_deps
}
)
#lite_cc_library(subgraph_bridge_concat_op_hw_ascend_npu SRCS concat_op.cc DEPS
# ${hw_ascend_npu_subgraph_bridge_deps}
# hw_ascend_npu_builder_libs)
set
(
hw_ascend_npu_subgraph_bridges
set
(
hw_ascend_npu_subgraph_bridges
subgraph_bridge_graph_hw_ascend_npu
subgraph_bridge_graph_hw_ascend_npu
...
...
lite/kernels/hw_ascend_npu/bridges/act_op.cc
浏览文件 @
5cab7cdd
...
@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
...
@@ -30,7 +30,7 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto
op_info
=
op
->
op_info
();
auto
op_info
=
op
->
op_info
();
auto
op_type
=
op_info
->
Type
();
auto
op_type
=
op_info
->
Type
();
auto
scope
=
op
->
scope
();
auto
scope
=
op
->
scope
();
VLOG
(
3
)
<<
"[HWAscendNPU] Converting "
+
op_type
+
"..."
;
LOG
(
INFO
)
<<
"[HWAscendNPU] Converting "
+
op_type
+
"..."
;
// Get input and output vars and op attributes
// Get input and output vars and op attributes
auto
x_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x_name
=
op_info
->
Input
(
"X"
).
front
();
...
@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx,
...
@@ -64,28 +64,35 @@ int ActConverter<ge::op::Activation>(void* ctx,
auto
op_info
=
op
->
op_info
();
auto
op_info
=
op
->
op_info
();
auto
op_type
=
op_info
->
Type
();
auto
op_type
=
op_info
->
Type
();
auto
scope
=
op
->
scope
();
auto
scope
=
op
->
scope
();
VLOG
(
3
)
<<
"[HWAscendNPU] Converting "
+
op_type
+
"..."
;
LOG
(
INFO
)
<<
"[HWAscendNPU] Converting "
+
op_type
+
"..."
;
// Get input and output vars and op attributes
// Get input and output vars and op attributes
auto
x_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x_name
=
op_info
->
Input
(
"X"
).
front
();
auto
x
=
scope
->
FindMutableTensor
(
x_name
);
auto
x
=
scope
->
FindMutableTensor
(
x_name
);
auto
x_dims
=
x
->
dims
();
auto
x_dims
=
x
->
dims
();
auto
out_name
=
op_info
->
Output
(
"Out"
).
front
();
auto
out_name
=
op_info
->
Output
(
"Out"
).
front
();
LOG
(
INFO
)
<<
"[HWAscendNPU] xname: "
<<
x_name
<<
", dims: "
<<
x_dims
;
// X node
// X node
std
::
shared_ptr
<
Node
>
x_node
=
nullptr
;
std
::
shared_ptr
<
Node
>
x_node
=
nullptr
;
if
(
graph
->
Has
(
x_name
))
{
if
(
graph
->
Has
(
x_name
))
{
LOG
(
INFO
)
<<
"[HWAscendNPU] graph has node: "
<<
x_name
;
x_node
=
graph
->
Get
(
x_name
);
x_node
=
graph
->
Get
(
x_name
);
}
else
{
}
else
{
LOG
(
INFO
)
<<
"[HWAscendNPU] graph does no have node: "
<<
x_name
;
x_node
=
graph
->
Add
(
x_name
,
*
x
);
x_node
=
graph
->
Add
(
x_name
,
*
x
);
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] out name: "
<<
out_name
;
#if 0
// Act node
// Act node
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_node = graph->template Add<ge::op::Activation>(out_name);
auto act_op = act_node->template data<ge::op::Activation>();
auto act_op = act_node->template data<ge::op::Activation>();
act_op->set_input_x(*x_node->data());
act_op->set_input_x(*x_node->data());
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// TODO(hong19860320) set the coef value for act Ops, such as leaky_relu,
// clipped_relu etc.
// clipped_relu etc.
LOG(INFO) << "[HWAscendNPU] activation mode: " << op_type
<< ", type: " << CvtActMode(op_type);
act_op->set_attr_mode(CvtActMode(op_type));
act_op->set_attr_mode(CvtActMode(op_type));
if (op_type == "relu_clipped") {
if (op_type == "relu_clipped") {
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
auto Relu_clipped_coef = op_info->GetAttr<float>("Relu_clipped_coef");
...
@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx,
...
@@ -94,6 +101,12 @@ int ActConverter<ge::op::Activation>(void* ctx,
float Relu_clipped_coef = 6.f;
float Relu_clipped_coef = 6.f;
act_op->set_attr_coef(Relu_clipped_coef);
act_op->set_attr_coef(Relu_clipped_coef);
}
}
#else
// Act node
auto
act_node
=
graph
->
template
Add
<
ge
::
op
::
Relu
>(
out_name
);
auto
act_op
=
act_node
->
template
data
<
ge
::
op
::
Relu
>();
act_op
->
set_input_x
(
*
x_node
->
data
());
#endif
return
SUCCESS
;
return
SUCCESS
;
}
}
...
...
lite/kernels/hw_ascend_npu/bridges/concat_op.cc
0 → 100644
浏览文件 @
5cab7cdd
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <all_ops.h>
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {

// Converts a Paddle `concat` op into a ge::op::Concat node and adds it to
// the HWAscendNPU subgraph IR graph carried in `ctx`.
//
// ctx:    opaque pointer to the hw_ascend_npu::Graph being built.
// op:     the lite operator to convert; provides input/output var names and
//         the "axis" attribute.
// kernel: unused by this converter; kept to match the bridge signature.
// Returns SUCCESS when the node has been added to the graph.
int ConcatConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HWAscendNPU] Converting " << op_type << " ... ";

  // Get input and output vars and op attributes
  auto x_names = op_info->Input("X");
  auto out_name = op_info->Output("Out").front();
  auto axis = op_info->GetAttr<int>("axis");
  auto num = x_names.size();

  // Create the Concat node, then traverse all of the input nodes and wire
  // them into its dynamic input list.
  auto concat_node = graph->Add<ge::op::Concat>(out_name);
  auto concat_op = concat_node->data<ge::op::Concat>();
  concat_op->set_input_concat_dim(axis);
  concat_op->set_attr_N(num);
  concat_op->create_dynamic_input_input_values(num);
  // NOTE(review): dynamic input indices start at 1, following the GE
  // dynamic-input convention — confirm against the installed GE headers.
  int idx = 1;
  for (auto& x_name : x_names) {
    auto x = scope->FindMutableTensor(x_name);
    // Reuse an existing graph node with the same name if present, otherwise
    // create a new node from the tensor.
    std::shared_ptr<Node> x_node = nullptr;
    if (graph->Has(x_name)) {
      x_node = graph->Get(x_name);
    } else {
      x_node = graph->Add(x_name, *x);
    }
    concat_op->set_dynamic_input_input_values(idx, *x_node->data());
    idx++;
  }
  return SUCCESS;
}

}  // namespace hw_ascend_npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

REGISTER_SUBGRAPH_BRIDGE(
    concat,
    kHWAscendNPU,
    paddle::lite::subgraph::hw_ascend_npu::ConcatConverter);
lite/kernels/hw_ascend_npu/bridges/conv_op.cc
0 → 100644
浏览文件 @
5cab7cdd
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/hw_ascend_npu/bridges/graph.h"
#include "lite/kernels/hw_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"

namespace paddle {
namespace lite {
namespace subgraph {
namespace hw_ascend_npu {

// Converts Paddle `conv2d` / `depthwise_conv2d` ops into HWAscendNPU IR
// nodes (ge::op::Convolution or ge::op::ConvolutionDepthwise), optionally
// followed by an Add node for non-channel bias and an Activation node for a
// fused activation.
//
// ctx:    opaque pointer to the hw_ascend_npu::Graph being built.
// op:     the lite operator to convert; provides Input/Filter/Bias/Output
//         var names and the conv attributes.
// kernel: unused by this converter; kept to match the bridge signature.
// Returns REBUILD_WHEN_SHAPE_CHANGED on success, FAILED when the bias shape
// is unsupported.
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto graph = static_cast<Graph*>(ctx);
  auto op_info = op->op_info();
  auto op_type = op_info->Type();
  auto scope = op->scope();
  VLOG(3) << "[HwAscendNPU] Converting " << op_type << "... ";

  // Get input and output vars and op attributes
  auto input_name = op_info->Input("Input").front();
  auto input = scope->FindMutableTensor(input_name);
  auto input_dims = input->dims();
  auto filter_name = op_info->Input("Filter").front();
  auto filter = scope->FindMutableTensor(filter_name);
  auto filter_dims = filter->dims();
  auto output_name = op_info->Output("Output").front();
  auto output = scope->FindMutableTensor(output_name);
  auto output_dims = output->dims();
  auto bs = input_dims[0];
  auto ic = input_dims[1];
  auto oc = filter_dims[0];
  // All tensors are expected in NCHW layout with matching batch/channels.
  CHECK_EQ(input_dims.size(), 4L);
  CHECK_EQ(output_dims.size(), 4L);
  CHECK_EQ(filter_dims.size(), 4L);
  CHECK_EQ(output_dims[0], bs);
  CHECK_EQ(output_dims[1], oc);
  auto strides = op_info->GetAttr<std::vector<int>>("strides");
  auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
  auto groups = op_info->GetAttr<int>("groups");
  auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
  bool with_act =
      op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
  std::string act_type =
      with_act ? op_info->GetAttr<std::string>("act_type") : "";
  float leaky_relu_alpha = act_type == "leaky_relu"
                               ? op_info->GetAttr<float>("leaky_relu_alpha")
                               : 0.f;
  CHECK_EQ(strides.size(), 2L);
  CHECK_EQ(dilations.size(), 2L);

  // Input node: reuse an existing node with the same name if present.
  std::shared_ptr<Node> input_node = nullptr;
  if (graph->Has(input_name)) {
    input_node = graph->Get(input_name);
  } else {
    input_node = graph->Add(input_name, *input);
  }

  // Expand 2-element paddings {h, w} to 4-element {h, h, w, w}.
  if (paddings.size() == 2L) {
    for (size_t i = 0; i < strides.size(); ++i) {
      int copy_pad = *(paddings.begin() + 2 * i);
      paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
    }
  }
  CHECK_EQ(paddings.size(), 4L)
      << "[HwAscendNPU] Paddings size should be the "
         "same or twice as the input size.";

  std::string padding_algorithm("");
  if (op_info->HasAttr("padding_algorithm")) {
    padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
  }
  operators::UpdatePaddingAndDilation(&paddings,
                                      &dilations,
                                      strides,
                                      padding_algorithm,
                                      input_dims,
                                      filter_dims);

  // Check depthwise mode, and decide whether use ConvolutionDepthwise Op
  bool use_depthwise_conv = false;  // Whether use ge::op::ConvolutionDepthwise ?
  bool is_depthwise_mode = ic == groups && oc == groups;
  if (is_depthwise_mode &&
      !((groups == 1 || groups >= 5) && dilations[0] == 1 &&
        dilations[1] == 1)) {
    use_depthwise_conv = true;
    LOG(WARNING)
        << "[HwAscendNPU] For depthwise mode, dilation = 1 and groups >= 5 "
           "(or groups = 1) is only supported in Convolution Op, so "
           "force to use ConvolutionDepthwise Op, but may lead poor "
           "performance.";
  }

  // Filter node
  auto filter_node = graph->Add(filter_name, *filter);

  // Add bias node if exists bias
  // Supports the bias nodes with the following dimensions
  // 0: {oc}
  // 1: {1, oc, oh, ow}
  // 2: {n, oc, oh, ow}
  std::shared_ptr<Node> bias_node = nullptr;
  bool is_channel_bias = false;
  if (HasInputArg(op_info, scope, "Bias")) {
    auto bias_name = op_info->Input("Bias").front();
    if (graph->Has(bias_name)) {
      bias_node = graph->Get(bias_name);
    } else {
      auto bias = scope->FindMutableTensor(bias_name);
      auto bias_dims = bias->dims();
      auto bias_data_size = bias_dims.production();
      auto output_data_size = output_dims.production();
      std::vector<int64_t> bias_shape;
      if (bias_data_size == oc) {
        // 0: {oc}
        bias_shape = {1, oc, 1, 1};
        is_channel_bias = true;
      } else if (bias_data_size == output_data_size / bs) {
        // 1: {1, oc, oh, ow}
        bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
      } else if (bias_data_size == output_data_size) {
        // 2: {n, oc, oh, ow}
        bias_shape = output_dims.Vectorize();
      } else {
        LOG(WARNING)
            << "[HwAscendNPU] Bias dimension " << bias_dims
            << " isn't supported in conv2d Op when output dimension is "
            << output_dims;
        return FAILED;
      }
      bias_node = graph->Add(bias_name, *bias, bias_shape);
    }
  }

  // Conv node
  std::shared_ptr<Node> conv_node = nullptr;
  if (use_depthwise_conv && is_depthwise_mode) {
    // FIX(review): the node was created as ge::op::DepthwiseConv2D but cast
    // back with data<ge::op::ConvolutionDepthwise>() — a type mismatch. Both
    // sides now use ConvolutionDepthwise, which matches the attribute
    // setters (mode/algo/format/pad_mode) used below.
    conv_node = graph->Add<ge::op::ConvolutionDepthwise>(output_name);
    auto conv_op = conv_node->data<ge::op::ConvolutionDepthwise>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_filter(*filter_node->data());
    conv_op->set_attr_mode(1);
    conv_op->set_attr_algo(0);
    conv_op->set_attr_format(0);    // NCHW
    conv_op->set_attr_pad_mode(5);  // VALID
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // ConvolutionDepthwise Op doesn't support bias, so append Add node to
    // support bias
    if (bias_node != nullptr) {
      auto add_node = graph->Add<ge::op::Add>(output_name);
      auto add_op = add_node->data<ge::op::Add>();
      add_op->set_input_x1(*conv_node->data());
      add_op->set_input_x2(*bias_node->data());
      conv_node = add_node;
    }
  } else {
    conv_node = graph->Add<ge::op::Convolution>(output_name);
    auto conv_op = conv_node->data<ge::op::Convolution>();
    conv_op->set_input_x(*input_node->data());
    conv_op->set_input_w(*filter_node->data());
    conv_op->set_attr_mode(1);
    // when padding_algorithm=="SAME", NPU is different from lite
    if (padding_algorithm == "VALID") {
      conv_op->set_attr_pad_mode(5);
    } else {
      conv_op->set_attr_pad_mode(0);
    }
    conv_op->set_attr_group(groups);
    conv_op->set_attr_pad(ge::AttrValue::LIST_INT(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilation(
        ge::AttrValue::LIST_INT({dilations[0], dilations[1]}));
    conv_op->set_attr_stride(
        ge::AttrValue::LIST_INT({strides[0], strides[1]}));
    conv_op->set_attr_kernel(
        ge::AttrValue::LIST_INT({filter_dims[2], filter_dims[3]}));
    // Convolution Op only support bias with dimension {1, oc, 1, 1},
    // so append Add node if dimension is {1, oc, oh, ow} or (n, oc, oh, ow)
    if (bias_node != nullptr) {
      if (is_channel_bias) {
        conv_op->set_input_b(*bias_node->data());
      } else {
        auto add_node = graph->Add<ge::op::Add>(output_name);
        auto add_op = add_node->data<ge::op::Add>();
        add_op->set_input_x1(*conv_node->data());
        add_op->set_input_x2(*bias_node->data());
        conv_node = add_node;
      }
    }
  }
  CHECK(conv_node);

  // Append an Activation node when an activation is fused into the conv.
  if (!act_type.empty()) {
    auto act_node = graph->Add<ge::op::Activation>(output_name);
    auto act_op = act_node->data<ge::op::Activation>();
    act_op->set_input_x(*conv_node->data());
    act_op->set_attr_mode(CvtActMode(act_type));
    if (act_type == "leaky_relu") {
      act_op->set_attr_negative_slope(leaky_relu_alpha);
    } else if (act_type == "relu6") {
      act_op->set_attr_coef(6.f);
    }
  }
  return REBUILD_WHEN_SHAPE_CHANGED;
}

}  // namespace hw_ascend_npu
}  // namespace subgraph
}  // namespace lite
}  // namespace paddle

// FIX(review): the converter lives in namespace hw_ascend_npu, but the
// registrations referenced paddle::lite::subgraph::npu::ConvConverter,
// which does not exist for this target.
REGISTER_SUBGRAPH_BRIDGE(
    conv2d,
    kHWAscendNPU,
    paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(
    depthwise_conv2d,
    kHWAscendNPU,
    paddle::lite::subgraph::hw_ascend_npu::ConvConverter);
lite/kernels/hw_ascend_npu/bridges/graph.cc
浏览文件 @
5cab7cdd
...
@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
...
@@ -27,7 +27,8 @@ int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
if
(
it
!=
nodes_
.
end
())
{
if
(
it
!=
nodes_
.
end
())
{
// Only variable node can be shared with the same name
// Only variable node can be shared with the same name
if
(
!
node
->
is_var
()
||
!
it
->
second
.
back
()
->
is_var
())
{
if
(
!
node
->
is_var
()
||
!
it
->
second
.
back
()
->
is_var
())
{
LOG
(
FATAL
)
<<
"[NPU] Const or data node "
<<
name
<<
" is redefined."
;
LOG
(
FATAL
)
<<
"[HWAscendNPU] Const or data node "
<<
name
<<
" is redefined."
;
return
-
1
;
return
-
1
;
}
}
}
else
{
}
else
{
...
@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
...
@@ -65,6 +66,13 @@ std::shared_ptr<Node> Graph::Add(const std::string& name,
PrecisionType
precision
,
PrecisionType
precision
,
DataLayoutType
layout
)
{
DataLayoutType
layout
)
{
auto
node
=
Add
<
ge
::
op
::
Data
>
(
name
,
precision
,
layout
);
auto
node
=
Add
<
ge
::
op
::
Data
>
(
name
,
precision
,
layout
);
std
::
stringstream
iss
;
iss
<<
"[HWAscendNPU] Add data node, shape: "
;
for
(
auto
&
s
:
shape
)
{
iss
<<
s
<<
","
;
}
iss
<<
" name: "
<<
name
;
LOG
(
INFO
)
<<
iss
.
str
();
ge
::
TensorDesc
desc
(
ge
::
TensorDesc
desc
(
ge
::
Shape
(
shape
),
CvtDataLayoutType
(
layout
),
CvtPrecisionType
(
precision
));
ge
::
Shape
(
shape
),
CvtDataLayoutType
(
layout
),
CvtPrecisionType
(
precision
));
node
->
data
<
ge
::
op
::
Data
>
()
->
update_input_desc_data
(
desc
);
node
->
data
<
ge
::
op
::
Data
>
()
->
update_input_desc_data
(
desc
);
...
...
lite/kernels/hw_ascend_npu/bridges/graph.h
浏览文件 @
5cab7cdd
...
@@ -181,13 +181,14 @@ class Graph {
...
@@ -181,13 +181,14 @@ class Graph {
}
}
std
::
shared_ptr
<
Node
>
Get
(
std
::
string
name
)
{
std
::
shared_ptr
<
Node
>
Get
(
std
::
string
name
)
{
CHECK
(
Has
(
name
))
<<
"[NPU] Node "
<<
name
<<
" not found."
;
CHECK
(
Has
(
name
))
<<
"[
HWAscend
NPU] Node "
<<
name
<<
" not found."
;
return
nodes_
.
at
(
name
).
back
();
return
nodes_
.
at
(
name
).
back
();
}
}
bool
Has
(
const
std
::
string
&
name
)
{
bool
Has
(
const
std
::
string
&
name
)
{
return
nodes_
.
find
(
name
)
!=
nodes_
.
end
();
return
nodes_
.
find
(
name
)
!=
nodes_
.
end
();
}
}
size_t
size
()
const
{
return
nodes_
.
size
();
}
private:
private:
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
shared_ptr
<
Node
>>>
nodes_
;
std
::
unordered_map
<
std
::
string
,
std
::
vector
<
std
::
shared_ptr
<
Node
>>>
nodes_
;
...
...
lite/kernels/hw_ascend_npu/bridges/utility.h
浏览文件 @
5cab7cdd
...
@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
...
@@ -47,7 +47,7 @@ ge::Tensor CvtTensor(const Tensor& in_tensor,
std
::
vector
<
int64_t
>
out_shape
=
{},
std
::
vector
<
int64_t
>
out_shape
=
{},
DataLayoutType
in_layout
=
DATALAYOUT
(
kNCHW
));
DataLayoutType
in_layout
=
DATALAYOUT
(
kNCHW
));
int
CvtActMode
(
std
::
string
act_type
);
int
CvtActMode
(
const
std
::
string
&
act_type
);
}
// namespace hw_ascend_npu
}
// namespace hw_ascend_npu
}
// namespace subgraph
}
// namespace subgraph
}
// namespace lite
}
// namespace lite
...
...
lite/kernels/hw_ascend_npu/subgraph_compute.cc
浏览文件 @
5cab7cdd
...
@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -32,22 +32,31 @@ int SubgraphEngine::BuildDeviceProgram() {
// the HWAscendNPU IR graph
// the HWAscendNPU IR graph
subgraph
::
hw_ascend_npu
::
Graph
graph
;
subgraph
::
hw_ascend_npu
::
Graph
graph
;
const
auto
&
bridges
=
subgraph
::
Registry
::
Instance
();
const
auto
&
bridges
=
subgraph
::
Registry
::
Instance
();
LOG
(
INFO
)
<<
"[HWAscendNPU] Build device program"
;
for
(
auto
&
inst
:
origin_program_
)
{
for
(
auto
&
inst
:
origin_program_
)
{
auto
op
=
const_cast
<
OpLite
*>
(
inst
.
op
());
auto
op
=
const_cast
<
OpLite
*>
(
inst
.
op
());
CHECK
(
op
);
CHECK
(
op
);
op
->
CheckShape
();
op
->
CheckShape
();
op
->
InferShape
();
op
->
InferShape
();
std
::
string
op_type
=
op
->
op_info
()
->
Type
();
std
::
string
op_type
=
op
->
op_info
()
->
Type
();
LOG
(
INFO
)
<<
"[HWAscendNPU] trying to convert OP: "
<<
op_type
;
if
(
!
bridges
.
Exists
(
op_type
,
TARGET
(
kHWAscendNPU
)))
{
if
(
!
bridges
.
Exists
(
op_type
,
TARGET
(
kHWAscendNPU
)))
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] OP: "
<<
op_type
<<
" does not exist for target HWAscendNPU"
;
return
subgraph
::
FAILED
;
return
subgraph
::
FAILED
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] OP: "
<<
op_type
<<
" exists for HWAscendNPU"
;
auto
kernel
=
inst
.
kernel
();
auto
kernel
=
inst
.
kernel
();
status
|=
bridges
.
Select
(
op_type
,
TARGET
(
kHWAscendNPU
))(
status
|=
bridges
.
Select
(
op_type
,
TARGET
(
kHWAscendNPU
))(
reinterpret_cast
<
void
*>
(
&
graph
),
op
,
const_cast
<
KernelBase
*>
(
kernel
));
reinterpret_cast
<
void
*>
(
&
graph
),
op
,
const_cast
<
KernelBase
*>
(
kernel
));
if
(
subgraph
::
CHECK_FAILED
(
status
))
{
if
(
subgraph
::
CHECK_FAILED
(
status
))
{
LOG
(
ERROR
)
<<
"[HWAscendNPU] OP: "
<<
op_type
<<
" select kernel failed"
;
return
subgraph
::
FAILED
;
return
subgraph
::
FAILED
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] OP: "
<<
op_type
<<
" select kernel for HWAscendNPU"
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] Graph size: "
<<
graph
.
size
();
// Collect the valid input and output nodes in the HiAI IR graph and update
// Collect the valid input and output nodes in the HiAI IR graph and update
// the input and output names
// the input and output names
device_inames_
.
clear
();
device_inames_
.
clear
();
...
@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -55,8 +64,12 @@ int SubgraphEngine::BuildDeviceProgram() {
std
::
vector
<
ge
::
Operator
>
device_inodes
;
std
::
vector
<
ge
::
Operator
>
device_inodes
;
std
::
vector
<
ge
::
Operator
>
device_onodes
;
std
::
vector
<
ge
::
Operator
>
device_onodes
;
for
(
auto
&
input_name
:
input_names_
)
{
for
(
auto
&
input_name
:
input_names_
)
{
LOG
(
INFO
)
<<
"[HWAscendNPU] input name: "
<<
input_name
;
if
(
graph
.
Has
(
input_name
))
{
if
(
graph
.
Has
(
input_name
))
{
LOG
(
INFO
)
<<
"[HWAscendNPU] Graph has input name: "
<<
input_name
;
if
(
graph
.
Get
(
input_name
)
->
is_data
())
{
if
(
graph
.
Get
(
input_name
)
->
is_data
())
{
LOG
(
INFO
)
<<
"[HWAscendNPU] the current input name: "
<<
input_name
<<
" is data"
;
device_inodes
.
push_back
(
*
graph
.
Get
(
input_name
)
->
data
());
device_inodes
.
push_back
(
*
graph
.
Get
(
input_name
)
->
data
());
device_inames_
.
push_back
(
input_name
);
device_inames_
.
push_back
(
input_name
);
}
else
{
}
else
{
...
@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() {
...
@@ -82,10 +95,15 @@ int SubgraphEngine::BuildDeviceProgram() {
CHECK
(
!
device_onames_
.
empty
())
CHECK
(
!
device_onames_
.
empty
())
<<
"[HWAscendNPU] No output nodes found for building NPU model"
;
<<
"[HWAscendNPU] No output nodes found for building NPU model"
;
LOG
(
INFO
)
<<
"[HWAscendNPU] Graph size to build: "
<<
graph
.
size
();
// Build the IR graph to om model as the device program
// Build the IR graph to om model as the device program
if
(
device_program_map_
.
count
(
inputs_shape_
)
>
0
)
{
if
(
device_program_map_
.
count
(
inputs_shape_
)
>
0
)
{
return
status
;
return
status
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] Start to build, device_inodes = "
<<
device_inodes
.
size
()
<<
", device_onodes = "
<<
device_onodes
.
size
();
auto
device_client
=
auto
device_client
=
lite
::
hw_ascend_npu
::
Device
::
Global
().
Build
(
device_inodes
,
device_onodes
);
lite
::
hw_ascend_npu
::
Device
::
Global
().
Build
(
device_inodes
,
device_onodes
);
if
(
device_client
==
nullptr
)
{
if
(
device_client
==
nullptr
)
{
...
@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
...
@@ -188,12 +206,14 @@ int SubgraphEngine::LaunchDeviceProgram() {
// tensors
// tensors
auto
device_program
=
device_program_map_
[
inputs_shape_
];
auto
device_program
=
device_program_map_
[
inputs_shape_
];
int
ret
=
0
;
int
ret
=
0
;
LOG
(
INFO
)
<<
"[HWAscendNPU] start to set input..."
;
ret
=
device_program
->
client
->
SetInput
(
origin_itensors_
,
ret
=
device_program
->
client
->
SetInput
(
origin_itensors_
,
device_program
->
origin_idims
);
device_program
->
origin_idims
);
if
(
ret
!=
0
)
{
if
(
ret
!=
0
)
{
return
ret
;
return
ret
;
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] start to create output..."
;
device_program
->
client
->
CreateOutput
(
device_program
->
origin_odims
);
device_program
->
client
->
CreateOutput
(
device_program
->
origin_odims
);
...
@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
...
@@ -205,10 +225,11 @@ int SubgraphEngine::LaunchDeviceProgram() {
};
};
auto
start_time
=
GetCurrentUS
();
auto
start_time
=
GetCurrentUS
();
CHECK_EQ
(
device_program
->
client
->
Process
(),
0
);
CHECK_EQ
(
device_program
->
client
->
Process
(),
0
);
VLOG
(
3
)
<<
"[HWAscendNPU] Process cost "
<<
GetCurrentUS
()
-
start_time
LOG
(
INFO
)
<<
"[HWAscendNPU] Process cost "
<<
GetCurrentUS
()
-
start_time
<<
" us"
;
<<
" us"
;
device_program
->
client
->
GetOutput
(
&
origin_otensors_
);
device_program
->
client
->
GetOutput
(
&
origin_otensors_
);
LOG
(
INFO
)
<<
"[HWAscendNPU] Get ouput done"
;
return
0
;
return
0
;
}
}
...
@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() {
...
@@ -238,7 +259,9 @@ void SubgraphCompute::PrepareForRun() {
}
}
void
SubgraphCompute
::
Run
()
{
void
SubgraphCompute
::
Run
()
{
LOG
(
INFO
)
<<
"[HWAscendNPU] Start to run"
;
CHECK
(
engine_
);
CHECK
(
engine_
);
LOG
(
INFO
)
<<
"[HWAscendNPU] Start to call Launch"
;
engine_
->
Launch
();
engine_
->
Launch
();
}
}
...
...
lite/kernels/npu/bridges/engine.cc
浏览文件 @
5cab7cdd
...
@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() {
...
@@ -105,15 +105,19 @@ bool Engine::InputShapeChanged() {
}
}
int
Engine
::
Launch
()
{
int
Engine
::
Launch
()
{
LOG
(
INFO
)
<<
"[HWAscendNPU] in Launch, start to build if needed"
;
// Rebuild device program when the shapes of input tensors have been changed.
// Rebuild device program when the shapes of input tensors have been changed.
if
(
CHECK_SUCCESS
(
build_device_program_status_
)
&&
if
(
CHECK_SUCCESS
(
build_device_program_status_
)
&&
CHECK_REBUILD_WHEN_SHAPE_CHANGED
(
build_device_program_status_
)
&&
CHECK_REBUILD_WHEN_SHAPE_CHANGED
(
build_device_program_status_
)
&&
InputShapeChanged
())
{
InputShapeChanged
())
{
Build
();
Build
();
}
}
LOG
(
INFO
)
<<
"[HWAscendNPU] launch program"
;
if
(
CHECK_FAILED
(
build_device_program_status_
))
{
if
(
CHECK_FAILED
(
build_device_program_status_
))
{
LOG
(
INFO
)
<<
"[HWAscendNPU] launch original program"
;
LaunchOriginProgram
();
LaunchOriginProgram
();
}
else
{
}
else
{
LOG
(
INFO
)
<<
"[HWAscendNPU] launch device program"
;
LaunchDeviceProgram
();
LaunchDeviceProgram
();
}
}
return
0
;
return
0
;
...
...
lite/tests/kernels/CMakeLists.txt
浏览文件 @
5cab7cdd
...
@@ -9,7 +9,8 @@ set (kernels
...
@@ -9,7 +9,8 @@ set (kernels
${
host_kernels
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
${
hw_ascend_npu_kernels
}
)
if
((
NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU AND NOT LITE_WITH_HW_ASCEND_NPU
)
AND
(
LITE_WITH_X86 OR LITE_WITH_ARM
))
message
(
STATUS
"======---------------------------------=================
${
hw_ascend_npu_kernels
}
"
)
if
((
NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM AND NOT LITE_WITH_MLU
)
AND
(
LITE_WITH_X86 OR LITE_WITH_ARM
))
lite_cc_test
(
test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
lite_cc_test
(
test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
lite_cc_test
(
test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
lite_cc_test
(
test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
lite_cc_test
(
test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
lite_cc_test
(
test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework
${
xpu_kernels
}
${
npu_kernels
}
${
x86_kernels
}
${
cuda_kernels
}
${
arm_kernels
}
${
lite_ops
}
${
host_kernels
}
${
hw_ascend_npu_kernels
}
)
...
...
lite/tests/kernels/activation_compute_test.cc
浏览文件 @
5cab7cdd
...
@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) {
...
@@ -293,6 +293,7 @@ TEST(Activation_relu, precision) {
place
=
TARGET
(
kXPU
);
place
=
TARGET
(
kXPU
);
#elif defined(LITE_WITH_HW_ASCEND_NPU)
#elif defined(LITE_WITH_HW_ASCEND_NPU)
place
=
TARGET
(
kHWAscendNPU
);
place
=
TARGET
(
kHWAscendNPU
);
std
::
cout
<<
"-----------test relu with hw_ascend_npu"
<<
std
::
endl
;
#else
#else
return
;
return
;
#endif
#endif
...
...
lite/tools/build_hw_ascend_npu.sh
浏览文件 @
5cab7cdd
...
@@ -3,7 +3,7 @@ set -ex
...
@@ -3,7 +3,7 @@ set -ex
# global variables with default value
# global variables with default value
ASCEND_HOME
=
"/usr/local/Ascend"
# Ascend SDK root directory
ASCEND_HOME
=
"/usr/local/Ascend"
# Ascend SDK root directory
TARGET_NAME
=
"test_
subgraph_pass
"
# default target
TARGET_NAME
=
"test_
kernel_activation_compute
"
# default target
BUILD_EXTRA
=
ON
# ON(with sequence ops)/OFF
BUILD_EXTRA
=
ON
# ON(with sequence ops)/OFF
WITH_TESTING
=
ON
# ON/OFF
WITH_TESTING
=
ON
# ON/OFF
...
@@ -80,7 +80,7 @@ function build_hw_ascend_npu {
...
@@ -80,7 +80,7 @@ function build_hw_ascend_npu {
-DWITH_TESTING
=
${
WITH_TESTING
}
\
-DWITH_TESTING
=
${
WITH_TESTING
}
\
-DASCEND_HOME
=
${
HW_ASCEND_NPU_SDK_ROOT
}
-DASCEND_HOME
=
${
HW_ASCEND_NPU_SDK_ROOT
}
make
-j
$NUM_CORES_FOR_COMPILE
make
$TARGET_NAME
-j2
cd
-
cd
-
echo
"Done"
echo
"Done"
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录