Unverified commit 769ba40b, authored by Qi Li, committed by GitHub

[ASCEND] Add Huawei Ascend310 support (#3936)

* [ASCEND] Add Huawei Ascend310 support, test=develop

* [ASCEND] fix some typos, test=develop

* [ASCEND] address comments and fix opt ci python file, test=develop

* [ASCEND] update based on new ascend env, test=develop

* [ASCEND] update after develop merge, test=develop
Parent: af7a9a96
@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
@@ -225,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
  include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
  include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
  add_definitions("-DLITE_WITH_MLU")
endif()

if (LITE_WITH_HUAWEI_ASCEND_NPU)
  add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()

if (LITE_WITH_PROFILE)
  add_definitions("-DLITE_WITH_PROFILE")
endif()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. Path to the Huawei Ascend install directory
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. Find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.2 dependencies of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(deps ${lite_deps_DEPS})
@@ -118,6 +118,12 @@ function (lite_deps TARGET)
    endforeach(var)
  endif()

  if (LITE_WITH_HUAWEI_ASCEND_NPU)
    foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
      set(deps ${deps} ${var})
    endforeach(var)
  endif()

  set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
  set(options SHARED shared STATIC static MODULE module)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  if (args_SHARED OR ARGS_shared)
@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
    set(options " -g ")
  endif()
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
  if(NOT WIN32)
@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
  endif()
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
      ARGS
      COMPILE_LEVEL # (basic|extra)
@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
      HVY_DEPS ${args_HVY_DEPS}
      CV_DEPS ${args_CV_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      )
  _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
  # strip binary target to reduce size
@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
  file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
    endif()
    set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
  endif()
  if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
    if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
      foreach(src ${args_SRCS})
        file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
      endforeach()
      return()
    endif()
    set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
  endif()
  if ("${device}" STREQUAL "OPENCL")
    if (NOT LITE_WITH_OPENCL)
      foreach(src ${args_SRCS})
@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
      RKNPU_DEPS ${args_RKNPU_DEPS}
      BM_DEPS ${args_BM_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
  set(options "")
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
      LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
      ARGS)
  cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -537,6 +557,7 @@ function(add_operator TARGET level)
      RKNPU_DEPS ${args_RKNPU_DEPS}
      BM_DEPS ${args_BM_DEPS}
      MLU_DEPS ${args_MLU_DEPS}
      HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
      PROFILE_DEPS ${args_PROFILE_DEPS}
      LIGHT_DEPS ${args_LIGHT_DEPS}
      HVY_DEPS ${args_HVY_DEPS}
......
@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......
@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
    #full api dynamic library
    lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
        DEPS paddle_api paddle_api_light paddle_api_full)
@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
        NPU_DEPS ${npu_kernels}
        APU_DEPS ${apu_kernels}
        RKNPU_DEPS ${rknpu_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
        )
    add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
    if(WIN32)
        target_link_libraries(paddle_light_api_shared shlwapi.lib)
    endif()
    target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
    if(APPLE)
        set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
        set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
@@ -94,6 +95,7 @@ if (WITH_TESTING)
            RKNPU_DEPS ${rknpu_kernels}
            BM_DEPS ${bm_kernels}
            MLU_DEPS ${mlu_kernels}
            HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
            APU_DEPS ${apu_kernels})
endif()
@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
    set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()

if(LITE_WITH_HUAWEI_ASCEND_NPU)
    set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
    set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()

message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
        RKNPU_DEPS ${rknpu_kernels}
        BM_DEPS ${bm_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
    CL_DEPS ${opencl_kernels}
    FPGA_DEPS ${fpga_kernels}
    BM_DEPS ${bm_kernels}
    MLU_DEPS ${mlu_kernels}
    HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
@@ -191,6 +200,7 @@ if(WITH_TESTING)
        FPGA_DEPS ${fpga_kernels}
        BM_DEPS ${bm_kernels}
        MLU_DEPS ${mlu_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
        EXCLUDE_COMPILE_DEPS "ON"
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
            --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
        APU_DEPS ${apu_kernels}
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
        BM_DEPS ${bm_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
    # The final inference library for just MobileConfig.
    bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
    target_link_libraries(paddle_api_full ${cuda_deps})
@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
        FPGA_DEPS ${fpga_kernels}
        BM_DEPS ${bm_kernels}
        MLU_DEPS ${mlu_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
    if (WITH_TESTING)
        add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
@@ -415,7 +427,8 @@ if(NOT IOS)
        RKNPU_DEPS ${rknpu_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
    lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
        ${ops} ${host_kernels}
@@ -430,7 +443,8 @@ if(NOT IOS)
        RKNPU_DEPS ${rknpu_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
    lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
        ${ops} ${host_kernels}
@@ -445,7 +459,8 @@ if(NOT IOS)
        RKNPU_DEPS ${rknpu_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
    lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
        ${ops} ${host_kernels}
@@ -459,7 +474,8 @@ if(NOT IOS)
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
    lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
        ${ops} ${host_kernels}
@@ -470,8 +486,9 @@ if(NOT IOS)
        XPU_DEPS ${xpu_kernels}
        RKNPU_DEPS ${rknpu_kernels}
        MLU_DEPS ${mlu_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
        CL_DEPS ${opencl_kernels}
        BM_DEPS ${bm_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels})
@@ -487,7 +504,8 @@ if(NOT IOS)
        CL_DEPS ${opencl_kernels}
        FPGA_DEPS ${fpga_kernels}
        X86_DEPS ${x86_kernels}
        CUDA_DEPS ${cuda_kernels}
        HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
@@ -79,6 +79,12 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
  Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
      raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
  Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
      config.get_device_id());
  Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
      config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
    !(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
  int num_threads = config.x86_math_library_num_threads();
......
@@ -43,6 +43,12 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
  Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
      raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
  Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
      config.get_device_id());
  Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
      config.subgraph_model_cache_dir());
#endif
}

std::unique_ptr<lite_api::Tensor> LightPredictorImpl::GetInput(int i) {
......
@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
      valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
    } else if (target_repr == "npu") {
      valid_places.emplace_back(TARGET(kNPU));
    } else if (target_repr == "huawei_ascend_npu") {
      valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
    } else if (target_repr == "xpu") {
      valid_places.emplace_back(TARGET(kXPU));
    } else if (target_repr == "mlu") {
@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
                                        "kXPU",
                                        "kRKNPU",
                                        "kAPU",
                                        "kHuaweiAscendNPU",
                                        "kAny",
                                        "kUnk"};
  int maximum_optype_length = 0;
@@ -265,16 +268,17 @@ void PrintHelpInfo() {
      " `--param_file=<param_path>`\n"
      " `--optimize_out_type=(protobuf|naive_buffer)`\n"
      " `--optimize_out=<output_optimize_model_dir>`\n"
      " "
      "`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
      " `--record_tailoring_info=(true|false)`\n"
      " Arguments of model checking and ops information:\n"
      " `--print_all_ops=true` Display all the valid operators of "
      "Paddle-Lite\n"
      " `--print_supported_ops=true "
      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
      " Display valid operators of input targets\n"
      " `--print_model_ops=true --model_dir=<model_param_dir> "
      "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
      " Display operators in the input model\n";
  std::cout << "opt version:" << opt_version << std::endl
            << help_info << std::endl;
......
@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
      valid_places_.emplace_back(TARGET(kX86));
    } else if (target_repr == "npu") {
      valid_places_.emplace_back(TARGET(kNPU));
    } else if (target_repr == "huawei_ascend_npu") {
      valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
    } else if (target_repr == "xpu") {
      valid_places_.emplace_back(TARGET(kXPU));
    } else if (target_repr == "rknpu") {
@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
      " `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
      "default\n"
      " `set_lite_out(output_optimize_model_dir)`\n"
      " "
      "`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
      " `record_model_info(false|true)`: refer to whether to record ops "
      "info for stripping lib, false by default`\n"
      " `run() : start model transformation`\n"
@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
      " `--param_file=<param_path>`\n"
      " `--optimize_out_type=(protobuf|naive_buffer)`\n"
      " `--optimize_out=<output_optimize_model_dir>`\n"
      " `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
      " `--record_tailoring_info=(true|false)`\n"
      " Arguments of model checking and ops information:\n"
      " `--print_all_ops=true` Display all the valid operators of "
      "Paddle-Lite\n"
      " `--print_supported_ops=true "
      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
      " Display valid operators of input targets\n"
      " `--print_model_ops=true --model_dir=<model_param_dir> "
      "--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
      " Display operators in the input model\n";
  std::cout << "paddlelite opt version:" << opt_version << std::endl
            << help_info << std::endl;
@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
                                        "kXPU",
                                        "kRKNPU",
                                        "kAPU",
                                        "kHuaweiAscendNPU",
                                        "kAny",
                                        "kUnk"};
  // Get the length of the first column: maximum length of the op_type
......
@@ -126,6 +126,7 @@ class LITE_API ConfigBase {
  PowerMode mode_{LITE_POWER_NO_BIND};
  // to save subgraph model for npu/xpu/...
  std::string subgraph_model_cache_dir_{""};
  int device_id_{0};

 public:
  explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
@@ -145,6 +146,9 @@ class LITE_API ConfigBase {
  const std::string& subgraph_model_cache_dir() const {
    return subgraph_model_cache_dir_;
  }
  // set Device ID
  void set_device_id(int device_id) { device_id_ = device_id; }
  const int get_device_id() const { return device_id_; }
};

/// CxxConfig is the config for the Full feature predictor.
......
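For context, a minimal usage sketch of the per-device configuration the ConfigBase changes above enable; the model path, cache directory, and the exact Place list are illustrative assumptions, not part of this commit:

// Hedged sketch: select the Ascend card and OM cache dir via the new API.
#include "paddle_api.h"    // ConfigBase / CxxConfig as patched above
#include "paddle_place.h"  // TARGET(...), PRECISION(...), Place
using namespace paddle::lite_api;

int main() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places({Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
                           Place{TARGET(kX86), PRECISION(kFloat)}});
  config.set_device_id(0);  // new setter: which Ascend310 device to use
  config.set_subgraph_model_cache_dir("/tmp/om_cache");  // reuse built OM models
  // CreatePaddlePredictor routes device_id into Context<kHuaweiAscendNPU>
  auto predictor = CreatePaddlePredictor(config);
  return predictor ? 0 : 1;
}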
@@ -75,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
                                         "bm",
                                         "mlu",
                                         "rknpu",
                                         "apu",
                                         "huawei_ascend_npu"};
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
  return target2string[x];
@@ -118,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
                                         "kBM",
                                         "kMLU",
                                         "kRKNPU",
                                         "kAPU",
                                         "kHuaweiAscendNPU"};
  auto x = static_cast<int>(target);
  CHECK_LT(x, static_cast<int>(TARGET(NUM)));
  return target2string[x];
@@ -163,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
                                       TARGET(kMLU),
                                       TARGET(kAPU),
                                       TARGET(kRKNPU),
                                       TARGET(kFPGA),
                                       TARGET(kHuaweiAscendNPU)});
  if (target == TARGET(kAny)) {
    return valid_set;
  }
......
@@ -57,7 +57,8 @@ enum class TargetType : int {
  kMLU = 11,
  kRKNPU = 12,
  kAPU = 13,
  kHuaweiAscendNPU = 14,
  NUM = 15,  // number of fields.
};
enum class PrecisionType : int {
  kUnk = 0,
......
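Since TargetType is a dense enum that the string tables in paddle_place.cc index by value, NUM must stay one past the newest target (kHuaweiAscendNPU = 14 here). A compile-time guard along these lines, illustrative only and not part of this commit, would catch a missed bump:

// Illustrative guard, assuming the enum values introduced above.
static_assert(
    static_cast<int>(paddle::lite_api::TargetType::NUM) ==
        static_cast<int>(paddle::lite_api::TargetType::kHuaweiAscendNPU) + 1,
    "TargetType::NUM must be one past the last target");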
@@ -48,6 +48,7 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
......
@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
      .value("MLU", TargetType::kMLU)
      .value("RKNPU", TargetType::kRKNPU)
      .value("APU", TargetType::kAPU)
      .value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
      .value("Any", TargetType::kAny);

  // PrecisionType
......
@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path;
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
// Convert the ACL IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
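Putting device.cc together, a hedged sketch of the build-then-load flow (graph construction by the subgraph bridges is elided; device 0 is an assumption):

// Sketch of the Build -> LoadFromMem round trip defined above.
std::vector<ge::Operator> input_nodes;   // filled by the subgraph bridges (elided)
std::vector<ge::Operator> output_nodes;  // filled by the subgraph bridges (elided)
std::vector<char> om_buffer;
auto& device = paddle::lite::huawei_ascend_npu::Device::Global();
if (device.Build(input_nodes, output_nodes, &om_buffer)) {
  // device id 0 assumed; LoadFromMem returns nullptr if the OM model fails to load
  auto client = device.LoadFromMem(om_buffer, 0);
  if (client != nullptr) {
    // ready for AclModelClient::ModelExecute(...)
  }
}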
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph to the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = output_tensor->size();
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
<< device_output_num << ", tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
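// Note: a single inference therefore runs CreateInputDataset ->
// CreateOutputDataset -> aclmdlExecute -> GetTensorFromDataset; the
// device-side dataset buffers are released later through DestroyDataset().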
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
      LOG(WARNING)
          << "[HUAWEI_ASCEND_NPU] failed to get data buffer of device data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
      LOG(WARNING)
          << "[HUAWEI_ASCEND_NPU] failed to destroy device data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
    // create ge::TensorDesc; a shared_ptr is used so that copies of
    // TensorDesc (e.g. stored by value in std::vector) can safely share
    // ownership instead of leaking the raw pointer
    ge_tensor_desc_ = std::make_shared<ge::TensorDesc>(
        GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
    CHECK(ge_tensor_desc_ != nullptr);
  }
  ~TensorDesc() = default;
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
  std::shared_ptr<ge::TensorDesc> ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
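// Illustrative example (not part of the original sources): for an NHWC
// tensor with dims {1, 224, 224, 3}, dim_order becomes {0, 3, 1, 2}, so
// GetNumber() == 1, GetChannel() == 3, GetHeight() == GetWidth() == 224.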
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_;
size_t model_weight_size_;
void* model_memory_ptr_;
void* model_weight_ptr_;
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
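// Minimal usage sketch (illustrative only; "model.om" is a hypothetical
// offline model produced by the ATC tool):
//   AclModelClient client(/* device_id */ 0);
//   if (client.LoadFromFile("model.om")) {
//     std::vector<TensorDesc> idims, odims;
//     client.GetModelIOTensorDim(&idims, &odims);
//     // ... fill input ge::Tensor objects, then:
//     // client.ModelExecute(&input_tensor, &output_tensor);
//     client.UnloadModel();
//   }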
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
 * This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
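// Usage sketch (illustrative): wrap any runtime call whose failure should
// abort with a readable error name, e.g.
//   ACL_CALL(aclrtSetDevice(0));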
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -24,7 +24,7 @@ namespace arena { ...@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() { void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr; std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets( static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)}); {TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) != bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end(); subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL) #if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
...@@ -48,7 +48,15 @@ void TestCase::CreateInstruction() { ...@@ -48,7 +48,15 @@ void TestCase::CreateInstruction() {
auto out_names = sub_op_desc->output_vars(); auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names); op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names); op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names); // keep only data nodes (skip const nodes, identified by the persistable flag)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names); op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type()); op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc( static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
......
...@@ -17,6 +17,15 @@ ...@@ -17,6 +17,15 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
int Context<TargetType::kMLU>::next_queue_id_{0}; int Context<TargetType::kMLU>::next_queue_id_{0};
std::map<int, int> Context<TargetType::kMLU>::queue_id_map_; std::map<int, int> Context<TargetType::kMLU>::queue_id_map_;
......
...@@ -62,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>; ...@@ -62,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>; using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>; using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>; using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <> template <>
class Context<TargetType::kHost> { class Context<TargetType::kHost> {
...@@ -101,6 +102,37 @@ class Context<TargetType::kNPU> { ...@@ -101,6 +102,37 @@ class Context<TargetType::kNPU> {
}; };
#endif #endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir() {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
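// Illustrative usage (a sketch, not mandated by this header): configure the
// per-thread options before building the program, e.g.
//   Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir("/tmp");
//   Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(0);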
#ifdef LITE_WITH_APU #ifdef LITE_WITH_APU
template <> template <>
class Context<TargetType::kAPU> { class Context<TargetType::kAPU> {
...@@ -390,6 +422,13 @@ class ContextScheduler { ...@@ -390,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU #ifdef LITE_WITH_APU
case TARGET(kAPU): case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo( kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
...@@ -471,6 +510,9 @@ class ContextScheduler { ...@@ -471,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU #ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>(); InitContext<TargetType::kAPU, APUContext>();
#endif #endif
......
...@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) ...@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kBM), TARGET(kBM),
TARGET(kRKNPU), TARGET(kRKNPU),
TARGET(kAPU), TARGET(kAPU),
TARGET(kMLU)}); TARGET(kMLU),
TARGET(kHuaweiAscendNPU)});
...@@ -199,6 +199,9 @@ TEST(Subgraph, detect_custom_model) { ...@@ -199,6 +199,9 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)}, Place{TARGET(kNPU), PRECISION(kFloat)},
#endif #endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XTCL #ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)}, Place{TARGET(kXPU), PRECISION(kFloat)},
#endif #endif
......
...@@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void HuaweiAscendNPUSubgraphPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists; std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \ #define USE_SUBGRAPH_BRIDGE(op_type, target) \
...@@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass,
paddle::lite::mir::HuaweiAscendNPUSubgraphPass)
.BindTargets({TARGET(kHuaweiAscendNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)}); .BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
......
...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { ...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class HuaweiAscendNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class APUSubgraphPass : public ProgramPass { class APUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
...@@ -187,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) { ...@@ -187,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif #endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
valid_places.push_back(
lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_XTCL #ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif #endif
......
...@@ -117,6 +117,7 @@ class Optimizer { ...@@ -117,6 +117,7 @@ class Optimizer {
// 'enable_int8' for all // 'enable_int8' for all
// of the quantized ops. // of the quantized ops.
"npu_subgraph_pass", "npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass", "xpu_subgraph_pass",
"bm_subgraph_pass", "bm_subgraph_pass",
"apu_subgraph_pass", "apu_subgraph_pass",
......
...@@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) { ...@@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
lod_ = other.lod_; lod_ = other.lod_;
memory_size_ = other.memory_size_; memory_size_ = other.memory_size_;
precision_ = other.precision_; precision_ = other.precision_;
persistable_ = other.persistable_;
buffer_->CopyDataFrom(*other.buffer_, memory_size_); buffer_->CopyDataFrom(*other.buffer_, memory_size_);
} }
......
...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc ...@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co ...@@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
......
...@@ -14,3 +14,4 @@ add_subdirectory(mlu) ...@@ -14,3 +14,4 @@ add_subdirectory(mlu)
add_subdirectory(apu) add_subdirectory(apu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(rknpu) add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
add_subdirectory(bridges)
add_kernel(subgraph_compute_huawei_ascend_npu HUAWEI_ASCEND_NPU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_huawei_ascend_npu subgraph_bridge_engine ${huawei_ascend_npu_subgraph_bridges})
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_huawei_ascend_npu SRCS utility.cc DEPS)
lite_cc_library(subgraph_bridge_graph_huawei_ascend_npu SRCS graph.cc DEPS subgraph_bridge_utility_huawei_ascend_npu)
set(huawei_ascend_npu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_engine subgraph_bridge_utility_huawei_ascend_npu subgraph_bridge_graph_huawei_ascend_npu)
lite_cc_library(subgraph_bridge_act_op_huawei_ascend_npu SRCS act_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_conv_op_huawei_ascend_npu SRCS conv_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
set(huawei_ascend_npu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_engine
subgraph_bridge_graph_huawei_ascend_npu
subgraph_bridge_act_op_huawei_ascend_npu
subgraph_bridge_conv_op_huawei_ascend_npu
CACHE INTERNAL "huawei_ascend_npu_subgraph_bridges")
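# To hook up a new op bridge (illustrative sketch; pool_op.cc is a
# hypothetical source file):
#   lite_cc_library(subgraph_bridge_pool_op_huawei_ascend_npu SRCS pool_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
# and append subgraph_bridge_pool_op_huawei_ascend_npu to the cached list above.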
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
template <typename ActType>
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ActType>(out_name);
auto act_op = act_node->template data<ActType>();
act_op->set_input_x(*x_node->data());
return SUCCESS;
}
template <>
int ActConverter<ge::op::LeakyRelu>(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Act node
auto act_node = graph->template Add<ge::op::LeakyRelu>(out_name);
auto act_op = act_node->template data<ge::op::LeakyRelu>();
act_op->set_input_x(*x_node->data());
// only for leaky_relu
auto alpha = op_info->GetAttr<float>("alpha");
act_op->set_attr_negative_slope(alpha);
return SUCCESS;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
sigmoid,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Sigmoid>);
REGISTER_SUBGRAPH_BRIDGE(
relu,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Relu>);
REGISTER_SUBGRAPH_BRIDGE(
tanh,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Tanh>);
REGISTER_SUBGRAPH_BRIDGE(
relu6,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Relu6>);
REGISTER_SUBGRAPH_BRIDGE(
leaky_relu,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::LeakyRelu>);
REGISTER_SUBGRAPH_BRIDGE(
softsign,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Softsign>);
REGISTER_SUBGRAPH_BRIDGE(
softplus,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Softplus>);
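// Registering another activation follows the same pattern (illustrative
// sketch; assumes the matching ge::op type exists in the Ascend op library):
// REGISTER_SUBGRAPH_BRIDGE(
//     abs,
//     kHuaweiAscendNPU,
//     paddle::lite::subgraph::huawei_ascend_npu::ActConverter<ge::op::Abs>);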
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/conv_op.h"
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " << op_type << "... ";
// Get input and output vars and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
ge::DataType ge_data_type = CvtPrecisionType(input->precision());
auto filter_name = op_info->Input("Filter").front();
auto filter = scope->FindMutableTensor(filter_name);
auto filter_dims = filter->dims();
auto output_name = op_info->Output("Output").front();
auto output = scope->FindMutableTensor(output_name);
auto output_dims = output->dims();
auto bs = input_dims[0];
auto ic = input_dims[1];
auto oc = filter_dims[0];
CHECK_EQ(input_dims.size(), 4L);
CHECK_EQ(output_dims.size(), 4L);
CHECK_EQ(filter_dims.size(), 4L);
CHECK_EQ(output_dims[0], bs);
CHECK_EQ(output_dims[1], oc);
auto strides = op_info->GetAttr<std::vector<int>>("strides");
auto paddings = op_info->GetAttr<std::vector<int>>("paddings");
auto groups = op_info->GetAttr<int>("groups");
  // Conv2D: groups must be set to 1; DepthwiseConv2D: groups not supported.
CHECK_LE(groups, 1)
<< "[HUAWEI_ASCEND_NPU] groups > 1 NOT supported, groups: " << groups;
auto dilations = op_info->GetAttr<std::vector<int>>("dilations");
bool with_act =
op_info->HasAttr("with_act") && op_info->GetAttr<bool>("with_act");
std::string act_type =
with_act ? op_info->GetAttr<std::string>("act_type") : "";
float leaky_relu_alpha = act_type == "leaky_relu"
? op_info->GetAttr<float>("leaky_relu_alpha")
: 0.f;
CHECK_EQ(strides.size(), 2L);
CHECK_EQ(dilations.size(), 2L);
// Input node
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
input_node = graph->Add(input_name, *input);
}
if (paddings.size() == 2L) {
for (size_t i = 0; i < strides.size(); ++i) {
int copy_pad = *(paddings.begin() + 2 * i);
paddings.insert(paddings.begin() + 2 * i + 1, copy_pad);
}
}
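  // e.g. a 2-element paddings {ph, pw} is expanded in place to
  // {ph, ph, pw, pw}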
  CHECK_EQ(paddings.size(), 4L)
      << "[HUAWEI_ASCEND_NPU] Paddings size should be 2 ({pad_h, pad_w}) "
         "or 4 ({top, bottom, left, right}).";
std::string padding_algorithm("");
if (op_info->HasAttr("padding_algorithm")) {
padding_algorithm = op_info->GetAttr<std::string>("padding_algorithm");
}
operators::UpdatePaddingAndDilation(&paddings,
&dilations,
strides,
padding_algorithm,
input_dims,
filter_dims);
// Check depthwise mode, and decide whether use DepthwiseConv2D Op
bool use_depthwise_conv = false;
bool is_depthwise_mode = (ic == groups && oc == groups && groups != 1);
if (is_depthwise_mode && dilations[0] == 1 && dilations[1] == 1) {
use_depthwise_conv = true;
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] DepthwiseConv2D op is used.";
}
// Filter node
auto filter_node = graph->Add(filter_name, *filter);
// Add bias node if exists bias
// Supports the bias nodes with the following dimensions
  // 0: {oc} => 1D tensor of format ND
// 1: {1, oc, oh, ow}
// 2: {n, oc, oh, ow}
std::vector<int64_t> bias_shape;
std::shared_ptr<Node> bias_node = nullptr;
bool is_channel_bias = false;
if (HasInputArg(op_info, scope, "Bias")) {
auto bias_name = op_info->Input("Bias").front();
if (graph->Has(bias_name)) {
bias_node = graph->Get(bias_name);
} else {
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
auto bias_data_size = bias_dims.production();
auto output_data_size = output_dims.production();
if (bias_data_size == oc) {
// 0: {oc}
bias_shape = {oc};
is_channel_bias = true;
} else if (bias_data_size == output_data_size / bs) {
// 1: {1, oc, oh, ow}
bias_shape = {1, output_dims[1], output_dims[2], output_dims[3]};
} else if (bias_data_size == output_data_size) {
// 2: {n, oc, oh, ow}
bias_shape = output_dims.Vectorize();
} else {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] Bias dimension " << bias_dims
<< " isn't supported in conv2d Op when output dimension is "
<< output_dims;
return FAILED;
}
bias_node = graph->Add(bias_name, *bias, bias_shape);
}
}
  // Ascend requires the conv op's tensor descs to be updated explicitly,
  // otherwise building the IR model will fail
ge::TensorDesc conv2d_input_desc_x(
ge::Shape(CvtShape(input_dims)), ge::FORMAT_NCHW, ge_data_type);
ge::TensorDesc conv2d_input_desc_filter(
ge::Shape(CvtShape(filter_dims)), ge::FORMAT_NCHW, ge_data_type);
ge::TensorDesc conv2d_input_desc_bias(
ge::Shape(bias_shape), ge::FORMAT_ND, ge_data_type);
ge::TensorDesc conv2d_output_desc_y(
ge::Shape(CvtShape(output_dims)), ge::FORMAT_NCHW, ge_data_type);
// Setting desc name
conv2d_input_desc_x.SetName("conv2d_input_desc_x");
conv2d_input_desc_filter.SetName("conv2d_input_desc_filter");
conv2d_input_desc_bias.SetName("conv2d_input_desc_bias");
conv2d_output_desc_y.SetName("conv2d_output_desc_y");
// Conv node
std::shared_ptr<Node> conv_node = nullptr;
if (use_depthwise_conv && is_depthwise_mode) {
conv_node = graph->Add<ge::op::DepthwiseConv2D>(output_name);
auto conv_op = conv_node->data<ge::op::DepthwiseConv2D>();
conv_op->set_input_x(*input_node->data());
conv_op->set_input_filter(*filter_node->data());
conv_op->set_attr_strides(
ge::Operator::OpListInt({1, 1, strides[0], strides[1]}));
conv_op->set_attr_dilations({1, 1, dilations[0], dilations[1]});
conv_op->set_attr_pads(
{paddings[0], paddings[1], paddings[2], paddings[3]});
conv_op->set_attr_data_format("NCHW");
if (bias_node != nullptr && is_channel_bias) {
conv_op->set_input_bias(*bias_node->data());
conv_op->update_input_desc_bias(conv2d_input_desc_bias);
}
// update tensor desc to conv2d
conv_op->update_input_desc_x(conv2d_input_desc_x);
conv_op->update_input_desc_filter(conv2d_input_desc_filter);
conv_op->update_output_desc_y(conv2d_output_desc_y);
} else {
conv_node = graph->Add<ge::op::Conv2D>(output_name);
auto conv_op = conv_node->data<ge::op::Conv2D>();
conv_op->set_input_x(*input_node->data());
conv_op->set_input_filter(*filter_node->data());
    // the N and C positions of strides/dilations must be 1 for NCHW
    conv_op->set_attr_strides(
        ge::Operator::OpListInt({1, 1, strides[0], strides[1]}));
    conv_op->set_attr_pads(ge::Operator::OpListInt(
        {paddings[0], paddings[1], paddings[2], paddings[3]}));
    conv_op->set_attr_dilations(
        ge::Operator::OpListInt({1, 1, dilations[0], dilations[1]}));
conv_op->set_attr_groups(groups);
conv_op->set_attr_data_format("NCHW");
if (bias_node != nullptr && is_channel_bias) {
conv_op->set_input_bias(*bias_node->data());
conv_op->update_input_desc_bias(conv2d_input_desc_bias);
}
// update tensor desc to conv2d
conv_op->update_input_desc_x(conv2d_input_desc_x);
conv_op->update_input_desc_filter(conv2d_input_desc_filter);
conv_op->update_output_desc_y(conv2d_output_desc_y);
}
// append Add node to support bias
if (bias_node != nullptr && !is_channel_bias) {
auto add_node = graph->Add<ge::op::Add>(output_name);
auto add_op = add_node->data<ge::op::Add>();
add_op->set_input_x1(*conv_node->data());
add_op->set_input_x2(*bias_node->data());
conv_node = add_node;
}
CHECK(conv_node);
  // ONLY supports relu/leaky_relu now
  // TODO(qili93): add more act types
if (!act_type.empty()) {
if (act_type == "relu") {
auto act_node = graph->Add<ge::op::Relu>(output_name);
auto act_op = act_node->data<ge::op::Relu>();
act_op->set_input_x(*conv_node->data());
} else if (act_type == "leaky_relu") {
auto act_node = graph->Add<ge::op::LeakyRelu>(output_name);
auto act_op = act_node->data<ge::op::LeakyRelu>();
act_op->set_input_x(*conv_node->data());
act_op->set_attr_negative_slope(leaky_relu_alpha);
} else {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] act type not supported: "
<< act_type;
return FAILED;
}
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
conv2d,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ConvConverter);
REGISTER_SUBGRAPH_BRIDGE(
depthwise_conv2d,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ConvConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include <utility>
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
auto it = nodes_.find(name);
if (it != nodes_.end()) {
// Only variable node can be shared with the same name
if (!node->is_var() || !it->second.back()->is_var()) {
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Const or data node " << name
<< " is redefined.";
return -1;
}
} else {
auto ret = nodes_.insert(
std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
CHECK(ret.second);
it = ret.first;
}
it->second.push_back(node);
return it->second.size();
}
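// The index returned above is the 1-based count of nodes registered under
// the given name; the templated Graph::Add in graph.h uses it to build a
// unique suffix for the generated IR operator name.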
// Const or data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
DataLayoutType layout) {
std::shared_ptr<Node> node = nullptr;
PrecisionType precision = tensor.precision();
if (tensor.persistable()) {
// Const node
node = Add<ge::op::Const>(name, precision, layout);
ge::TensorDesc desc(ge::Shape(shape),
CvtDataLayoutType(layout),
CvtPrecisionType(precision));
desc.SetName("const_node_desc");
node->data<ge::op::Const>()->set_attr_value(
CvtTensor(tensor, shape, layout));
node->data<ge::op::Const>()->update_output_desc_y(desc);
} else {
// Data node
node = Add(name, shape, precision, layout);
}
return node;
}
// Data node
std::shared_ptr<Node> Graph::Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision,
DataLayoutType layout) {
auto node = Add<ge::op::Data>(name, precision, layout);
ge::TensorDesc desc(
ge::Shape(shape), CvtDataLayoutType(layout), CvtPrecisionType(precision));
desc.SetName("data_node_desc");
node->data<ge::op::Data>()->update_input_desc_x(desc);
node->data<ge::op::Data>()->update_output_desc_y(desc);
return node;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/op_lite.h"
#include "lite/core/tensor.h"
#include "op_proto/built-in/inc/all_ops.h" // opp/op_proto/built-in/inc
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
// Graph and Node are defined to collect all of the converted Ascend IR nodes
class Node {
public:
enum class Role {
kVar = 0,
kConst,
kData,
};
Node(std::shared_ptr<ge::Operator> data,
PrecisionType precision,
DataLayoutType layout,
Role role)
: data_(data), precision_(precision), layout_(layout), role_(role) {}
Node(PrecisionType precision, DataLayoutType layout, Role role)
: precision_(precision), layout_(layout), role_(role) {}
void set_data(std::shared_ptr<ge::Operator> data) { data_ = data; }
void set_precision(PrecisionType precision) { precision_ = precision; }
void set_layout(DataLayoutType layout) { layout_ = layout; }
void set_role(Role role) { role_ = role; }
template <typename T>
std::shared_ptr<T> data() {
return std::static_pointer_cast<T>(data_);
}
std::shared_ptr<ge::Operator> data() { return data_; }
PrecisionType precision() const { return precision_; }
DataLayoutType layout() const { return layout_; }
bool is_var() const { return role_ == Role::kVar; }
bool is_const() const { return role_ == Role::kConst; }
bool is_data() const { return role_ == Role::kData; }
private:
std::shared_ptr<ge::Operator> data_{nullptr};
PrecisionType precision_{PRECISION(kFloat)};
DataLayoutType layout_{DATALAYOUT(kNCHW)};
Role role_{Role::kVar};
};
class Graph {
public:
int Add(const std::string& name, std::shared_ptr<Node> node);
// Variable, const or data node
template <typename T>
std::shared_ptr<Node> Add(const std::string& name,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW)) {
Node::Role role = Node::Role::kVar;
if (typeid(T) == typeid(ge::op::Const)) {
role = Node::Role::kConst;
} else if (typeid(T) == typeid(ge::op::Data)) {
role = Node::Role::kData;
}
auto node = std::make_shared<Node>(precision, layout, role);
auto idx = Add(name, node);
CHECK_GE(idx, 1);
    // Generate a unique name for the created Ascend IR operator
node->set_data(
std::make_shared<T>(name + "__" + paddle::lite::to_string(idx)));
return node;
}
// Const or data node
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
std::vector<int64_t> shape,
DataLayoutType layout = DATALAYOUT(kNCHW));
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
DataLayoutType layout = DATALAYOUT(kNCHW)) {
return Add(name, tensor, tensor.dims().Vectorize(), layout);
}
std::shared_ptr<Node> Add(const std::string& name,
const Tensor& tensor,
DDim dims,
DataLayoutType layout = DATALAYOUT(kNCHW)) {
return Add(name, tensor, dims.Vectorize(), layout);
}
// Const node
template <typename T>
std::shared_ptr<Node> Add(const std::string& name,
const std::vector<T>& data,
std::vector<int64_t> shape = {},
DataLayoutType layout = DATALAYOUT(kNCHW)) {
if (shape.empty()) {
shape = {static_cast<int64_t>(data.size())};
} else {
      int64_t size = 1;
      for (auto i : shape) {
        size *= i;
      }
      CHECK_EQ(static_cast<int64_t>(data.size()), size);
}
Tensor tensor;
tensor.Resize(shape);
tensor.set_persistable(true);
std::memcpy(reinterpret_cast<uint8_t*>(tensor.mutable_data<T>()),
reinterpret_cast<const uint8_t*>(data.data()),
data.size() * sizeof(T));
return Add(name, tensor, layout);
}
template <typename T>
std::shared_ptr<Node> Add(const std::string& name,
const std::vector<T>& data,
DDim dims,
DataLayoutType layout = DATALAYOUT(kNCHW)) {
return Add(name, data, dims.Vectorize(), layout);
}
template <typename T>
std::shared_ptr<Node> Add(const std::string& name,
T value,
std::vector<int64_t> shape = {1},
DataLayoutType layout = DATALAYOUT(kNCHW)) {
int64_t size = 1;
for (auto i : shape) {
size *= i;
}
std::vector<T> data(size, value);
return Add(name, data, shape, layout);
}
template <typename T>
std::shared_ptr<Node> Add(const std::string& name,
T value,
DDim dims,
DataLayoutType layout = DATALAYOUT(kNCHW)) {
return Add(name, value, dims.Vectorize(), layout);
}
// Data node
std::shared_ptr<Node> Add(const std::string& name,
std::vector<int64_t> shape,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW));
std::shared_ptr<Node> Add(const std::string& name,
DDim dims,
PrecisionType precision = PRECISION(kFloat),
DataLayoutType layout = DATALAYOUT(kNCHW)) {
return Add(name, dims.Vectorize(), precision, layout);
}
std::shared_ptr<Node> Get(std::string name) {
CHECK(Has(name)) << "[HUAWEI_ASCEND_NPU] Node " << name << " not found.";
return nodes_.at(name).back();
}
bool Has(const std::string& name) {
return nodes_.find(name) != nodes_.end();
}
private:
std::map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
};
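// Illustrative usage inside an op converter (names are hypothetical):
//   auto w = graph->Add("w", *weight_tensor);    // const node (persistable)
//   auto x = graph->Add("x", {1, 3, 224, 224});  // data node
//   auto y = graph->Add<ge::op::Relu>("y");      // var node wrapping an IR op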
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// activation
USE_SUBGRAPH_BRIDGE(sigmoid, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(relu, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(tanh, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(relu6, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(leaky_relu, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(softsign, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(softplus, kHuaweiAscendNPU);
// conv
USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kHuaweiAscendNPU);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include <utility>
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname) {
auto iarg_names = op_info->input_argnames();
if (std::find(iarg_names.begin(), iarg_names.end(), argname) !=
iarg_names.end()) {
auto inputs = op_info->Input(argname);
if (inputs.empty()) {
return false;
}
auto var_name = inputs.front();
auto var = scope->FindVar(var_name);
return var != nullptr;
} else {
return false;
}
}
ge::DataType CvtPrecisionType(PrecisionType itype) {
ge::DataType otype = ge::DT_FLOAT;
switch (itype) {
case PRECISION(kFloat):
otype = ge::DT_FLOAT;
break;
case PRECISION(kFP16):
otype = ge::DT_FLOAT16;
break;
case PRECISION(kInt8):
otype = ge::DT_INT8;
break;
case PRECISION(kInt16):
otype = ge::DT_INT16;
break;
case PRECISION(kInt32):
otype = ge::DT_INT32;
break;
case PRECISION(kInt64):
otype = ge::DT_INT64;
break;
    // TODO(liq27): support more precision types
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert precision type("
<< PrecisionToStr(itype) << ") from Lite to NPU";
break;
}
return otype;
}
ge::Format CvtDataLayoutType(DataLayoutType itype) {
ge::Format otype = ge::FORMAT_NCHW;
switch (itype) {
case DATALAYOUT(kNCHW):
otype = ge::FORMAT_NCHW;
break;
case DATALAYOUT(kNHWC):
otype = ge::FORMAT_NHWC;
break;
    // TODO(liq27): support more data layout types
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Can not convert data layout type("
<< DataLayoutToStr(itype)
<< ") from Lite to HUAWEI_ASCEND_NPU";
break;
}
return otype;
}
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape) {
std::vector<int64_t> out_shape;
  // Pad the shape at the front to 4 dimensions (NCHW); start the loop at
  // in_shape.size() so the bound cannot underflow when the input already has
  // 4 or more dimensions
  for (size_t i = in_shape.size(); i < 4; i++) {
    out_shape.push_back(1);
  }
for (size_t i = 0; i < in_shape.size(); i++) {
out_shape.push_back(in_shape[i]);
}
return out_shape;
}
std::vector<int64_t> CvtShape(const DDim& in_dims) {
return CvtShape(in_dims.Vectorize());
}
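// e.g. CvtShape({3, 5}) returns {1, 1, 3, 5}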
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape,
DataLayoutType in_layout) {
PrecisionType in_precision = in_tensor.precision();
auto in_size = in_tensor.dims().production();
auto in_shape = in_tensor.dims().Vectorize();
if (out_shape.empty()) {
out_shape = in_shape;
}
ge::TensorDesc out_desc(ge::Shape(out_shape),
CvtDataLayoutType(in_layout),
CvtPrecisionType(in_precision));
auto out_size = out_desc.GetShape().GetShapeSize();
CHECK_EQ(out_size, in_size);
ge::Tensor out_tensor;
out_tensor.SetTensorDesc(out_desc);
out_tensor.SetData(reinterpret_cast<const uint8_t*>(in_tensor.raw_data()),
in_tensor.memory_size());
return out_tensor;
}
int CvtActMode(std::string act_type) {
int act_mode = 1;
if (act_type == "sigmoid") {
act_mode = 0;
} else if (act_type == "relu") {
act_mode = 1;
} else if (act_type == "tanh") {
act_mode = 2;
} else if (act_type == "relu_clipped" || act_type == "relu6") {
act_mode = 3;
} else if (act_type == "elu") {
act_mode = 4;
} else if (act_type == "leaky_relu") {
act_mode = 5;
} else if (act_type == "abs") {
act_mode = 6;
} else if (act_type == "softsign") {
act_mode = 8;
} else if (act_type == "softplus") {
act_mode = 9;
} else if (act_type == "hard_sigmoid") {
act_mode = 10;
} else if (act_type == "thresholded_relu") {
act_mode = 11;
} else {
    // TODO(liqi27): support more activation modes
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] Unsupported activation type "
<< act_type;
}
return act_mode;
}
const std::string& CvtFormat(ge::Format format) {
static const int MAX_FORMAT_LENGTH = 25;
static const std::string format2string[] = {
"FORMAT_NCHW = 0",
"FORMAT_NHWC = 1",
"FORMAT_ND = 2",
"FORMAT_NC1HWC0 = 3",
"FORMAT_FRACTAL_Z = 4",
"FORMAT_NC1C0HWPAD = 5",
"FORMAT_NHWC1C0 = 6",
"FORMAT_FSR_NCHW = 7",
"FORMAT_FRACTAL_DECONV = 8",
"FORMAT_C1HWNC0 = 9",
"FORMAT_FRACTAL_DECONV_TRANSPOSE = 10",
"FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11",
"FORMAT_NC1HWC0_C04 = 12",
"FORMAT_FRACTAL_Z_C04 = 13",
"FORMAT_CHWN = 14",
"FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15",
"FORMAT_HWCN = 16",
"FORMAT_NC1KHKWHWC0 = 17",
"FORMAT_BN_WEIGHT = 18",
"FORMAT_FILTER_HWCK = 19",
"FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20",
"FORMAT_HASHTABLE_LOOKUP_KEYS = 21",
"FORMAT_HASHTABLE_LOOKUP_VALUE = 22",
"FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23",
"FORMAT_HASHTABLE_LOOKUP_HITS = 24"};
auto x = static_cast<int>(format);
CHECK_LT(x, MAX_FORMAT_LENGTH);
return format2string[x];
}
const std::string& CvtDataType(ge::DataType data_type) {
static const int MAX_DATATYPE_LENGTH = 14;
static const std::string datatype2string[] = {"DT_FLOAT=0",
"DT_FLOAT16=1",
"DT_INT8=2",
"DT_INT32=3",
"DT_UINT8=4",
"Unknown=5",
"DT_INT16=6",
"DT_UINT16=7",
"DT_UINT32=8",
"DT_INT64=9",
"DT_UINT64=10",
"DT_DOUBLE=11",
"DT_BOOL=12",
"DT_STRING=13"};
auto x = static_cast<int>(data_type);
CHECK_LT(x, MAX_DATATYPE_LENGTH);
return datatype2string[x];
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <vector>
// #include "graph/buffer.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/core/op_lite.h"
#include "lite/utils/macros.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
// Type/tensor converters for converting Paddle type/tensor to the Ascend (GE)
// type/tensor
bool HasInputArg(const OpInfo* op_info,
const Scope* scope,
const std::string& argname);
ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
// Pad the shape to 4 dimensions (NCHW) for Ascend
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape);
std::vector<int64_t> CvtShape(const DDim& in_dims);
ge::Tensor CvtTensor(const Tensor& in_tensor,
std::vector<int64_t> out_shape = {},
DataLayoutType in_layout = DATALAYOUT(kNCHW));
int CvtActMode(std::string act_type);
const std::string& CvtFormat(ge::Format format);
const std::string& CvtDataType(ge::DataType data_type);
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
This diff has been collapsed.
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <memory>
#include <string>
#include <vector>
#include "graph/tensor.h"
#include "lite/backends/huawei_ascend_npu/device.h"
#include "lite/core/kernel.h"
#include "lite/kernels/npu/bridges/engine.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace huawei_ascend_npu {
using TensorDesc = paddle::lite::huawei_ascend_npu::TensorDesc;
using AclModelClient = paddle::lite::huawei_ascend_npu::AclModelClient;
class DeviceProgram {
public:
DeviceProgram() {}
~DeviceProgram() {}
std::string GenerateModelName(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims);
bool LoadFromCacheFile(const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::string& model_cache_dir,
const int device_id);
bool BuildGraphAndCacheToFile(
const std::vector<Instruction>& origin_program,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
const std::vector<std::vector<int64_t>>& origin_idims,
const std::vector<Tensor*>& origin_otensors,
const std::string& model_cache_dir,
const int device_id);
bool ShareBufferWithOriginTensors(
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
std::vector<Tensor*>* origin_itensors,
std::vector<Tensor*>* origin_otensors,
std::vector<std::shared_ptr<ge::Tensor>>* device_itensors,
std::vector<std::shared_ptr<ge::Tensor>>* device_otensors);
bool SharedBufferWithOutputTensors(
const std::vector<std::string>& output_names,
std::vector<Tensor*>* origin_otensors,
std::vector<std::shared_ptr<ge::Tensor>>* device_otensors);
bool ZeroCopyRun(std::vector<std::shared_ptr<ge::Tensor>>* device_itensors,
std::vector<std::shared_ptr<ge::Tensor>>* device_otensors);
public:
std::string model_name_{""};
std::shared_ptr<AclModelClient> model_client_{nullptr};
std::vector<std::vector<int64_t>> origin_odims_;
std::vector<PrecisionType> origin_otypes_;
std::vector<TensorDesc> device_idims_{};
std::vector<TensorDesc> device_odims_{};
};
class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext* ctx,
int block_idx,
cpp::BlockDesc* block_desc,
const std::vector<std::string>& input_names,
const std::vector<std::string>& output_names,
Scope* scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
protected:
bool PrepareWorkspaceForDeviceProgram() override;
bool BuildDeviceProgram() override;
bool LaunchDeviceProgram() override;
private:
std::vector<std::shared_ptr<ge::Tensor>> device_itensors_{};
std::vector<std::shared_ptr<ge::Tensor>> device_otensors_{};
std::map<std::vector<std::vector<int64_t>>, std::shared_ptr<DeviceProgram>>
device_programs_;
};
class SubgraphCompute
: public KernelLite<TARGET(kHuaweiAscendNPU), PRECISION(kAny)> {
public:
using param_t = operators::SubgraphParam;
void PrepareForRun() override;
void Run() override;
virtual ~SubgraphCompute() = default;
private:
std::unique_ptr<SubgraphEngine> engine_;
};
} // namespace huawei_ascend_npu
} // namespace kernels
} // namespace lite
} // namespace paddle
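The device_programs_ map keys compiled device models by the full set of input shapes, so each shape combination is compiled (or loaded from disk) at most once — useful for models with dynamic input sizes. A minimal sketch of the expected lookup inside BuildDeviceProgram, under the assumption that the surrounding engine exposes the usual input_names/origin_idims/model_cache_dir/device_id state (the real .cc may differ; error handling elided):

// Reuse a DeviceProgram if this input-shape combination was seen before.
auto& program = device_programs_[origin_idims];
if (!program) {
  program = std::make_shared<DeviceProgram>();
  // Prefer the on-disk model cache; fall back to building the GE graph.
  if (!program->LoadFromCacheFile(input_names, output_names, origin_idims,
                                  model_cache_dir, device_id)) {
    program->BuildGraphAndCacheToFile(origin_program, input_names, output_names,
                                      origin_idims, origin_otensors,
                                      model_cache_dir, device_id);
  }
}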
if(NOT LITE_WITH_NPU AND NOT LITE_WITH_XTCL AND NOT LITE_WITH_BM AND NOT LITE_WITH_RKNPU AND NOT LITE_WITH_MLU AND NOT LITE_WITH_APU AND NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
...
This diff has been collapsed.
...@@ -302,6 +302,9 @@ TEST(Activation_relu, precision) {
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // Using fp16 in NPU
#else
return;
#endif
...@@ -324,6 +327,9 @@ TEST(Activation_leaky_relu, precision) {
abs_error = 1e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // Using fp16 in NPU
#else
return;
#endif
...@@ -404,6 +410,9 @@ TEST(Activation_sigmoid, precision) {
abs_error = 1e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // Using fp16 in NPU
#else
return;
#endif
...@@ -428,6 +437,9 @@ TEST(Activation_tanh, precision) {
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // Using fp16 in NPU
#else
return;
#endif
...@@ -467,6 +479,9 @@ TEST(Activation_relu6, precision) {
abs_error = 1e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // Using fp16 in NPU
#else
return;
#endif
...
...@@ -413,6 +413,9 @@ TEST(Conv2d, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 5e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 5e-2; // Using fp16 in NPU
#else
return;
#endif
...
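Note the looser 5e-2 tolerance here versus the 1e-2 used in the activation tests above: a convolution accumulates many fp16 products per output element, so rounding error compounds more than in element-wise ops.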
...@@ -33,6 +33,9 @@ BUILD_APU=OFF
APU_DDK_ROOT="$(pwd)/apu_sdk_lib/"
BUILD_RKNPU=OFF
RKNPU_DDK_ROOT="$(pwd)/rknpu/"
WITH_HUAWEI_ASCEND_NPU=OFF # Huawei Ascend Builder/Runtime Libs on X86 host
# default installation path, ensure acllib/atc/opp directories are all in this root dir
HUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5"
PYTHON_EXECUTABLE_OPTION=""
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
...@@ -364,6 +367,11 @@ function make_x86 {
root_dir=$(pwd)
build_directory=$BUILD_DIR/build.lite.x86
if [ ${WITH_HUAWEI_ASCEND_NPU} == "ON" ]; then
export CXX=/usr/bin/g++ # Ascend needs g++ on CentOS
build_directory=$BUILD_DIR/build.lite.huawei_ascend_npu
fi
if [ -d $build_directory ]
then
rm -rf $build_directory
...@@ -390,6 +398,8 @@ function make_x86 {
-DLITE_WITH_XPU=$BUILD_XPU \
-DLITE_WITH_XTCL=$BUILD_XTCL \
-DXPU_SDK_ROOT=$XPU_SDK_ROOT \
-DLITE_WITH_HUAWEI_ASCEND_NPU=$WITH_HUAWEI_ASCEND_NPU \
-DHUAWEI_ASCEND_NPU_DDK_ROOT=$HUAWEI_ASCEND_NPU_DDK_ROOT \
-DCMAKE_BUILD_TYPE=Release \
-DPY_VERSION=$PY_VERSION \
$PYTHON_EXECUTABLE_OPTION
...@@ -558,6 +568,14 @@ function main {
RKNPU_DDK_ROOT="${i#*=}"
shift
;;
--with_huawei_ascend_npu=*)
WITH_HUAWEI_ASCEND_NPU="${i#*=}"
shift
;;
--huawei_ascend_npu_ddk_root=*)
HUAWEI_ASCEND_NPU_DDK_ROOT="${i#*=}"
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
...
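With these options wired through, an x86 host build with Ascend support should presumably be reachable as something like `./lite/tools/build.sh --with_huawei_ascend_npu=ON --huawei_ascend_npu_ddk_root=/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5 x86` (the `x86` subcommand name is assumed from `make_x86` above; it is not shown in this hunk).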
...@@ -399,6 +399,64 @@ function build_test_xpu {
test_xpu
}
function cmake_huawei_ascend_npu {
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$PWD/third_party/install/mklml/lib"
prepare_workspace
cmake .. \
${common_flags} \
-DWITH_GPU=OFF \
-DWITH_MKLDNN=OFF \
-DLITE_WITH_X86=ON \
-DWITH_MKL=ON \
-DLITE_BUILD_EXTRA=ON \
-DLITE_WITH_HUAWEI_ASCEND_NPU=ON \
-DHUAWEI_ASCEND_NPU_DDK_ROOT="/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux_gcc4.8.5" \
-DCMAKE_BUILD_TYPE=Release
}
function build_huawei_ascend_npu {
make lite_compile_deps -j$NUM_CORES_FOR_COMPILE
}
# It will eagerly run all lite-related unit tests.
function test_huawei_ascend_npu {
# Due to missing Ascend kernels, we skip the following tests temporarily.
# TODO(xxx) clear the skip list later
local skip_list=("test_paddle_api" "test_cxx_api" "test_googlenet"
"test_mobilenetv1_lite_x86" "test_mobilenetv2_lite_x86"
"test_inceptionv4_lite_x86" "test_light_api"
"test_apis" "test_model_bin"
)
local to_skip=0
for _test in $(cat $TESTS_FILE); do
to_skip=0
for skip_name in ${skip_list[@]}; do
if [ $skip_name = $_test ]; then
echo "to skip " $skip_name
to_skip=1
fi
done
if [ $to_skip -eq 0 ]; then
ctest -R $_test -V
fi
done
}
# Build the code and run lite server tests. This is executed in the CI system.
function build_test_huawei_ascend_npu {
cur_dir=$(pwd)
build_dir=$cur_dir/build.lite.huawei_ascend_npu_test
mkdir -p $build_dir
cd $build_dir
cmake_huawei_ascend_npu
build_huawei_ascend_npu
test_huawei_ascend_npu
}
# test_arm_android <some_test_name> <adb_port_number>
function test_arm_android {
local test_name=$1
...@@ -1157,6 +1215,10 @@ function main {
test_arm_android $TEST_NAME $ARM_PORT
shift
;;
test_huawei_ascend_npu)
test_huawei_ascend_npu
shift
;;
build_test_cuda_server)
build_test_cuda_server
shift
...@@ -1174,6 +1236,10 @@ function main {
build_test_xpu
shift
;;
build_test_huawei_ascend_npu)
build_test_huawei_ascend_npu
shift
;;
build_test_train)
build_test_train
shift
...
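With these dispatch entries in place, the full Ascend pipeline can presumably be driven by invoking this CI script with `build_test_huawei_ascend_npu` (configure, build, then run the filtered test list), or with `test_huawei_ascend_npu` alone to re-run the tests against an existing build.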
...@@ -56,8 +56,8 @@ const std::vector<std::vector<std::string>> supported_ops_target = {
ops_lines = []
# valid targets and valid_ops
valid_targets = ["kUnk", "kHost", "kX86", "kCUDA", "kARM", "kOpenCL", "kAny", "kFPGA", "kNPU", "kXPU", "kBM", "kMLU", "kRKNPU", "kAPU", "kHuaweiAscendNPU"]
valid_ops = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]
class TargetType:
kUnk = 0
kHost = 1
...@@ -73,6 +73,7 @@ class TargetType:
kMLU = 11
kRKNPU = 12
kAPU = 13
kHuaweiAscendNPU = 14
# record op_info of valid kernels into `valid_ops` according to different target type
...
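Note the invariant behind the paired edit above: valid_ops must hold exactly one (initially empty) bucket per entry in valid_targets, since each kernel's op info is appended at the index given by its TargetType value; adding kHuaweiAscendNPU = 14 therefore grows the literal from 14 to 15 empty lists.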