Unverified commit 0d7b2a32, authored by: H HappyAngel, committed by: GitHub

Merge pull request #141 from PaddlePaddle/develop

pull 
......@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile when profile mode is ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing the exception when error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -224,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
......@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc and libatomic symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
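The flag logic above decides whether C++ exceptions and unwind tables are compiled in: any configuration that can propagate exceptions across the API boundary (OpenCL with clang, the Python bindings, LITE_WITH_EXCEPTION, or a non-tiny build) keeps -fexceptions, while tiny builds strip them to save binary size. A minimal C++ sketch of how the LITE_WITH_EXCEPTION definition might gate error handling (the helper name ReportFatal is hypothetical, not from this patch):

```cpp
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <string>

// Hypothetical helper: with LITE_WITH_EXCEPTION defined, errors are thrown
// and may cross the API boundary, which is why the build keeps -fexceptions
// and unwind tables; without it, we log and abort, so -fno-exceptions and
// -fno-unwind-tables are safe and the binary stays small.
inline void ReportFatal(const std::string& msg) {
#ifdef LITE_WITH_EXCEPTION
  throw std::runtime_error(msg);
#else
  std::fprintf(stderr, "%s\n", msg.c_str());
  std::abort();
#endif
}
```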
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. Path to the Huawei Ascend DDK install directory
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directories
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. Find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependency of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.2 dependency of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
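The imported targets above map onto Ascend's ACL runtime (acllib) and the ATC graph-building stack. As a rough sketch of what the runtime half provides once linked, client code typically initializes ACL and binds a device before loading an om model; this is an assumption about typical ACL bring-up, not code from this patch:

```cpp
#include "acl/acl.h"  // from ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include

// Assumed typical ACL bring-up: initialize the runtime once, then bind the
// calling thread to the device chosen through the lite config. Both calls
// come from libascendcl.so, imported as acl_ascendcl above.
bool InitAscendDevice(int device_id) {
  if (aclInit(nullptr) != ACL_ERROR_NONE) return false;  // nullptr = defaults
  return aclrtSetDevice(device_id) == ACL_ERROR_NONE;
}
```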
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# Optional: libhcl.so only exists in HiAI DDK 320 or later, so a missing library is not fatal
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
......@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
......@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -537,6 +557,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -45,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
......@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if(WIN32)
target_link_libraries(paddle_light_api_shared shlwapi.lib)
endif()
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
......@@ -94,6 +95,7 @@ if (WITH_TESTING)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
APU_DEPS ${apu_kernels})
endif()
......@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
......@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
......@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -191,6 +200,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -415,7 +427,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -430,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -445,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -459,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -470,8 +486,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -487,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
......@@ -58,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source_file name from op type and kernel type
......@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;
core::KernelPickFactor factor;
......
......@@ -52,12 +52,12 @@ class LITE_API Predictor {
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
const std::shared_ptr<Scope>& root,
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
const std::vector<std::string>& vars_to_clone = {})
: program_desc_(program_desc), scope_(root_scope) {
Program program(program_desc_, scope_, valid_places, vars_to_clone);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
valid_places_ = valid_places;
......@@ -79,30 +79,28 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const {
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
return predictor;
return std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
}
std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
const std::vector<std::string>& vars_to_clone) const {
CHECK(program_desc_) << "Both program and scope of current predictor "
"should not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predictor should "
"not be nullptr in Clone mode.";
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
program_desc_, scope_, valid_places_, vars_to_clone);
for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
for (auto var_name : vars_to_clone) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor = predictor->scope_->Var(var_name)->GetMutable<Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
return predictor;
......@@ -140,6 +138,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......
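The reworked Clone above shares program_desc_ and the weight scope between predictors, while Clone(vars_to_clone) additionally gives each clone private copies of the listed variables. A hedged usage sketch (the variable name is illustrative, not from the patch):

```cpp
#include <memory>

// Assumes `predictor` is an already-built paddle::lite::Predictor.
void CloneExample(paddle::lite::Predictor& predictor) {
  // Full sharing: the clone reuses the program desc, the weight scope,
  // and the valid places of the parent.
  std::shared_ptr<paddle::lite::Predictor> worker = predictor.Clone();

  // Partial isolation: "hidden_state" gets a private copy in the clone's
  // exec scope, seeded from the parent's tensor via CopyDataFrom().
  std::shared_ptr<paddle::lite::Predictor> isolated =
      predictor.Clone({"hidden_state"});
  (void)worker;
  (void)isolated;
}
```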
......@@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
......
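Together with the ConfigBase changes later in this patch, the Init hook above lets users pin the Ascend device and the subgraph model cache from the config. A sketch of the intended call pattern, under the assumption that set_subgraph_model_cache_dir is the setter matching the subgraph_model_cache_dir() getter shown in this patch:

```cpp
#include "paddle_api.h"  // Paddle-Lite C++ API header; install path may vary

using namespace paddle::lite_api;  // NOLINT

// Hedged sketch: wiring the new device-ID and cache options into a full
// (CxxConfig) predictor targeting the Huawei Ascend NPU.
std::shared_ptr<PaddlePredictor> BuildAscendPredictor() {
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");
  config.set_device_id(0);  // run the partitioned subgraphs on Ascend card 0
  config.set_subgraph_model_cache_dir("/tmp/ascend_cache");  // assumed setter
  config.set_valid_places({
      Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
      Place{TARGET(kX86), PRECISION(kFloat)},  // host fallback
  });
  return CreatePaddlePredictor(config);
}
```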
......@@ -22,16 +22,16 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -43,15 +43,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -60,7 +60,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -109,16 +109,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
const cpp::ProgramDesc& prog = cpp_program_desc_;
auto current_block = prog.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc const*> feeds;
std::vector<cpp::OpDesc const*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -133,55 +134,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
// Only extract the ops from the main block desc and generate the runtime
// program from it
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
const cpp::ProgramDesc& cpp_desc = cpp_program_desc_;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -207,10 +188,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_desc.BlocksSize(); i++) {
auto* block = cpp_desc.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
......
......@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
......@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
// Get the lengh of the first column: maximum length of the op_type
......
......@@ -126,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
......@@ -145,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
// set and get device ID
void set_device_id(int device_id) { device_id_ = device_id; }
int get_device_id() const { return device_id_; }
};
/// CxxConfig is the config for the Full feature predictor.
......
......@@ -76,7 +76,8 @@ const std::string& TargetToStr(TargetType target) {
"bm",
"mlu",
"rknpu",
"apu"};
"apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -119,7 +120,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM",
"kMLU",
"kRKNPU",
"kAPU"};
"kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -164,7 +166,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11,
kRKNPU = 12,
kAPU = 13,
NUM = 14, // number of fields.
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
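Because TargetToStr and TargetRepr index their string tables with static_cast<int>(target), every new TargetType entry must be appended to all the tables at the same position, which is what the three table edits above do. A toy, self-contained sketch of a compile-time guard one could add to keep them aligned (not part of this patch):

```cpp
#include <cstddef>

// Toy version of the TargetType / string-table pairing used in the patch.
enum class TargetType : int { kUnk = 0, kHost = 1, kX86 = 2, NUM = 3 };
static const char* target2string[] = {"unk", "host", "x86"};
static_assert(sizeof(target2string) / sizeof(target2string[0]) ==
                  static_cast<std::size_t>(TargetType::NUM),
              "target2string must have one entry per TargetType value");
```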
......@@ -48,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass);
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass);
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
......
......@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace arm {
namespace math {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
......@@ -30,9 +30,16 @@ namespace lite {
namespace cuda {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename Dtype>
inline __device__ Dtype Sigmoid(const Dtype a) {
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-a));
const Dtype min = SIGMOID_THRESHOLD_MIN;
const Dtype max = SIGMOID_THRESHOLD_MAX;
Dtype tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-tmp));
}
template <>
......@@ -63,6 +70,7 @@ inline __device__ half ReLU(const half a) {
template <typename Dtype>
inline __device__ Dtype Tanh(const Dtype a) {
Dtype tmp = static_cast<Dtype>(-2.0) * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
static_cast<Dtype>(1.0);
}
......
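The clamps above bound the operand of expf: SIGMOID_THRESHOLD_MIN/MAX match the saturation thresholds Paddle uses for recurrent activations, and EXP_MAX_INPUT keeps expf from overflowing for large-magnitude inputs to Tanh. A host-side C++ sketch of the same clamped sigmoid, for illustration only:

```cpp
#include <algorithm>
#include <cmath>

// Clamped sigmoid mirroring the CUDA kernel above: inputs are saturated to
// [-40, 13] before the exp so that expf never overflows, matching the
// recurrent-activation thresholds defined above.
inline float ClampedSigmoid(float a) {
  const float kMin = -40.0f, kMax = 13.0f;
  float t = std::min(std::max(a, kMin), kMax);
  return 1.0f / (1.0f + std::exp(-t));
}
```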
......@@ -22,10 +22,6 @@ namespace lite {
namespace cuda {
namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void scale_kernel(int count,
const T* in_data,
......@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T>
__global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
}
......@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
}
template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) {
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void scale(int num, const float*, float*, float, cudaStream_t, float);
template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float);
} // namespace math
......
......@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream);
template <typename T>
void scale(
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
......@@ -32,7 +32,7 @@ __global__ void CopyMatrixRowsKernel(const T* src,
bool is_src_index) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * gridDim.x + idy;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
......@@ -72,7 +72,7 @@ void CopyMatrixRowsFunctor<T>::operator()(
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, true);
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
......
......@@ -53,11 +53,11 @@ class LoDTensor2BatchFunctor {
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
struct SeqInfo {
SeqInfo(size_t start, size_t length, size_t seq_idx)
: start_(start), length_(length), seq_idx_(seq_idx) {}
size_t start_;
size_t length_;
size_t seq_idx_;
SeqInfo(size_t start_val, size_t len_val, size_t seq_val)
: start(start_val), length(len_val), seq_idx(seq_val) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
......@@ -76,7 +76,7 @@ class LoDTensor2BatchFunctor {
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
return a.length_ > b.length_;
return a.length > b.length;
});
// Calculate the start position of each batch.
......@@ -106,7 +106,7 @@ class LoDTensor2BatchFunctor {
batch_lods.emplace_back(std::vector<uint64_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length_;
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
......@@ -119,8 +119,8 @@ class LoDTensor2BatchFunctor {
for (size_t n = 0; n < max_seqlen; ++n) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length_;
size_t start = seq_info[i].start_;
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
......@@ -133,7 +133,7 @@ class LoDTensor2BatchFunctor {
}
auto* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx_;
seq_order[i] = seq_info[i].seq_idx;
}
batch_tensor->set_lod(batch_lods);
......
......@@ -86,8 +86,7 @@ void SequencePadding(T* pad_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void SequencePadding(float* pad_data,
......
......@@ -19,7 +19,7 @@
namespace paddle {
namespace lite {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......@@ -30,7 +30,6 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void**>(r)[-1] = p;
memset(r, 0, size);
return r;
}
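// A minimal sketch of the over-allocate-and-align scheme used here, assuming
// MALLOC_ALIGN is a power of two (16 after this change):
//
//   char* p = static_cast<char*>(malloc(size + sizeof(void*) + MALLOC_ALIGN - 1));
//   void* r = reinterpret_cast<void*>(
//       reinterpret_cast<size_t>(p + sizeof(void*) + MALLOC_ALIGN - 1) &
//       ~(static_cast<size_t>(MALLOC_ALIGN) - 1));
//   static_cast<void**>(r)[-1] = p;  // stash the raw pointer just below r
//   // ... Free() recovers it later: free(static_cast<void**>(ptr)[-1]);
//
// Dropping the memset(r, 0, size) means callers now receive uninitialized
// memory, matching plain malloc semantics.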
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
......
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path;
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
// Convert the HiAI IR graph to the HiAI om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
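// A hedged sketch of the build-then-load flow (graph construction omitted;
// the variable names are hypothetical):
//
//   std::vector<ge::Operator> inputs /* = ... */, outputs /* = ... */;
//   std::vector<char> om_model;
//   auto& device = paddle::lite::huawei_ascend_npu::Device::Global();
//   if (device.Build(inputs, outputs, &om_model)) {
//     auto model_client = device.LoadFromMem(om_model, /*device_id=*/0);
//   }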
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph to the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
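// Call sites go through the singleton; e.g. loading a cached om file
// (a sketch: the path and device id below are placeholders):
//
//   auto client = paddle::lite::huawei_ascend_npu::Device::Global()
//                     .LoadFromFile("/path/to/model.om", /*device_id=*/0);
//   if (!client) { /* fall back or report the error */ }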
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = output_tensor->size();
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
<< device_output_num << "tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
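// End-to-end use of AclModelClient (a sketch; ge::Tensor allocation is
// omitted and every name below is hypothetical):
//
//   AclModelClient client(/*device_id=*/0);
//   client.LoadFromFile("model.om");
//   std::vector<TensorDesc> in_descs, out_descs;
//   client.GetModelIOTensorDim(&in_descs, &out_descs);
//   // ... build ge::Tensor inputs/outputs matching the descriptors ...
//   client.ModelExecute(&inputs, &outputs);
//   client.UnloadModel();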
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::Tensordesc
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
CHECK(ge_tensor_desc_ != nullptr);
}
~TensorDesc() { ge_tensor_desc_ = nullptr; }
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
ge::TensorDesc* ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
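// Sketch: for an NHWC input with dims {1, 224, 224, 3}, dim_order becomes
// {0, 3, 1, 2}, so the accessors still answer in NCHW terms:
//
//   TensorDesc desc(ACL_FLOAT, dims, ACL_FORMAT_NHWC);
//   desc.GetNumber();   // 1   -> dims[0]
//   desc.GetChannel();  // 3   -> dims[3]
//   desc.GetHeight();   // 224 -> dims[1]
//   desc.GetWidth();    // 224 -> dims[2]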
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_;
size_t model_weight_size_;
void* model_memory_ptr_;
void* model_weight_ptr_;
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
* This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
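// Usage sketch: wrap any status-returning ACL/ATC call whose failure should
// abort, e.g.
//
//   ACL_CALL(aclrtSetDevice(0));
//   ATC_CALL(ge::aclgrphBuildInitialize(global_options));
//
// Note that on failure the streamed (msg) evaluates the wrapped expression
// again while building the fatal message, so these macros suit calls that
// can safely be re-issued once.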
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
......@@ -33,7 +33,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)});
{TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
......@@ -32,25 +32,35 @@ void TestCase::CreateInstruction() {
#endif
if (enable_subgraph_op) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
// keep only data inputs (skip const vars, which are marked persistable)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -60,7 +70,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -46,7 +46,7 @@ class TestCase {
base_scope_(new Scope) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareData();
......
......@@ -17,8 +17,13 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -61,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -84,6 +86,35 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
#endif
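// Sketch of threading the cache dir through the scope (names hypothetical):
//
//   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
//       exec_scope, "/data/local/tmp/npu_cache");
//   auto dir = Context<TargetType::kNPU>::SubgraphModelCacheDir(exec_scope);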
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
......@@ -91,8 +122,14 @@ class Context<TargetType::kNPU> {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static std::string subgraph_model_cache_dir_;
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
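// The Ascend context keeps these settings in thread_local statics (the
// change above), so each worker thread sets its own values; a sketch:
//
//   HuaweiAscendNPUContext::SetSubgraphModelCacheDir("/tmp/ascend_cache");
//   HuaweiAscendNPUContext::SetHuaweiAscendDeviceID(0);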
......@@ -385,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
......@@ -466,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
......
......@@ -33,6 +33,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the
// nodes which are only linked to the control flow op nodes but never linked
// to the other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to by any op
// in the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < static_cast<int>(block_size));
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
// A tensor array may be used only as an output var in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
// remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes,
// i.e. the nodes which are only linked to the control flow op nodes but
// never linked to the other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
// Remove the nodes which are used in the subblock recursively; the
// remaining nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......@@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase {
<< "Y of last fc must have been transposed";
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, scope);
fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places());
block2_stmt->SetOp(fake_subgraph_op);
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
......@@ -34,20 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
}
// fuse quantized node and dequant node
for (auto& op_type :
{"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) {
std::vector<std::string> quantized_op_types = {
"conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"};
for (auto& op_type : quantized_op_types) {
fusion::DequantOpFuser fuser(op_type);
fuser(graph.get());
}
for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto& op_type : quantized_op_types) {
fusion::ChannelWiseDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// process quant_dequant_node
fusion::DeleteQuantDequantOpFuser dqd_fuser;
dqd_fuser(graph.get());
std::vector<std::string> quant_dequant_op_types = {
"fake_quantize_dequantize_abs_max",
"fake_quantize_dequantize_moving_average_abs_max"};
for (auto& op_type : quant_dequant_op_types) {
fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type);
dqd_fuser(graph.get());
}
}
} // namespace mir
......
......@@ -175,11 +175,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph,
for (int i = 0; i < weight_scale_size; i++) {
weight_scale.push_back(whole_weight_scale);
}
// Arm CPU does not support conv2d_transpose
if (quantized_op_type_ != "conv2d_transpose") {
op_desc.SetAttr("enable_int8", true);
}
op_desc.SetAttr("enable_int8", true);
op_desc.SetInputScale(weight_name, weight_scale);
// change the weight from the float type to int8 type.
......@@ -284,7 +280,6 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
op_desc.SetInput("X", {quantized_op_input->arg()->name});
op_desc.SetOutput("Out", {dequant_op_out->arg()->name});
}
// Arm CPU does not support conv2d_transpose
if (quantized_op_type_ != "conv2d_transpose") {
op_desc.SetAttr("enable_int8", true);
}
......@@ -320,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
}
void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max";
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* input_act_node = VarNode("input_act_node")
->assert_is_op_input(quant_dequant_op_type_, "X");
auto* quant_dequant_node =
OpNode("quant_dequant_node", quant_dequant_op_type_)
->assert_is_op(quant_dequant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_dequant_op_type, "OutScale");
->assert_is_op_output(quant_dequant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")
->assert_is_op_output(quant_dequant_op_type, "Out");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
->assert_is_op_output(quant_dequant_op_type_, "Out");
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type_, "InScale");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
} else {
quant_dequant_node->LinksFrom({input_act_node});
}
output_scale_node->LinksFrom({quant_dequant_node});
output_act_node->LinksFrom({quant_dequant_node});
}
void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_scale_node = matched.at("input_scale_node");
auto* input_act_node = matched.at("input_act_node");
auto* quant_dequant_node = matched.at("quant_dequant_node");
auto* output_scale_node = matched.at("output_scale_node");
......@@ -373,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
}
// delete nodes and edges
std::set<const Node*> nodes2rm = {
input_scale_node, quant_dequant_node, output_scale_node, output_act_node};
quant_dequant_node, output_scale_node, output_act_node};
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node = matched.at("input_scale_node");
nodes2rm.insert(input_scale_node);
}
GraphSafeRemoveNodes(graph, nodes2rm);
}
......
......@@ -86,17 +86,22 @@ class ChannelWiseDequantOpFuser : public FuseBase {
std::string quantized_op_type_{};
};
/* The pattern like "fake_quantize_dequantize_moving_average_abs_max +
* quantized_op" can be deteted by this fuser. The fuser modifies the input
* scale for the quantized_op and deletes the fake_quant_dequant_op.
/* The pattern like "fake_quantize_dequantize_op + quantized_op" can be
* detected by this fuser. The fuser modifies the input scale for the
* quantized_op and deletes the fake_quant_dequant_op.
*/
class DeleteQuantDequantOpFuser : public FuseBase {
public:
explicit DeleteQuantDequantOpFuser(const std::string& quant_dequant_op_type)
: quant_dequant_op_type_(quant_dequant_op_type) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
private:
std::string quant_dequant_op_type_{};
};
} // namespace fusion
......
......@@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes_in_order = graph->StmtTopologicalOrder();
}
insts_.emplace_back();
for (auto& item : nodes_in_order) {
if (item->IsStmt()) {
auto& stmt = item->AsStmt();
......@@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front()));
}
}
}
......
......@@ -42,7 +42,7 @@ class GenerateProgramPass : public ProgramPass {
}
private:
std::vector<Instruction> insts_;
std::vector<std::vector<Instruction>> insts_;
};
} // namespace mir
......
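// insts_ is now indexed as insts_[block_idx][instruction_idx]; the Apply()
// loop above fills it block by block (a sketch of the shape, assuming one
// emplace_back() per block):
//
//   std::vector<std::vector<Instruction>> insts;
//   insts.emplace_back();                              // enter a new block
//   insts.back().emplace_back(op, std::move(kernel));  // append instruction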
......@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU),
TARGET(kMLU)});
TARGET(kMLU),
TARGET(kHuaweiAscendNPU)});
......@@ -284,13 +284,19 @@ void MLUPostprocessPass::InsertBefore(SSAGraph* graph,
head_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
UpdateInputTo(
sub_block_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateInputTo(sub_op_desc, head_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -444,21 +450,27 @@ void MLUPostprocessPass::InsertAfter(SSAGraph* graph,
tail_node->AsArg().name,
cur_node->AsArg().name);
// for subgraph op, modify the BlockDesc
auto* sub_block_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetSubBlock();
for (size_t i = 0; i < sub_block_desc->OpsSize(); ++i) {
auto* sub_block_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(i);
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto* sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
UpdateOutputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
/* graph like this
* subgraph_op_0
* / \
* / \
* subgraph_op_1 host_op
*/
UpdateInputTo(
sub_block_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
UpdateInputTo(sub_op_desc, tail_node->AsArg().name, cur_node->AsArg().name);
}
// recreate the op
......@@ -482,15 +494,22 @@ void MLUPostprocessPass::RecreateOp(Node* inst_node, SSAGraph* graph) {
}
}
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock();
for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc);
if (op_desc->Type() == "conv2d") {
for (auto& names : op_desc->inputs()) {
bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node,
Node* inst_node) {
auto sub_program_desc = dynamic_cast<paddle::lite::operators::SubgraphOp*>(
inst_node->AsStmt().op().get())
->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx =
inst_node->AsStmt().op()->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx);
CHECK(sub_op_desc);
if (sub_op_desc->Type() == "conv2d") {
for (auto& names : sub_op_desc->inputs()) {
if (std::find(names.second.begin(),
names.second.end(),
arg_node->AsArg().name) != names.second.end()) {
......@@ -746,19 +765,23 @@ std::pair<bool, std::string> CheckOutputAndInsert(
// Insert cast ops on MLU to avoid casting on the CPU
void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
const Type* subgraph_type) {
auto subgraph_op = subgraph_node->AsStmt().op();
CHECK_EQ(subgraph_op->Type(), "subgraph");
auto op = dynamic_cast<operators::SubgraphOp*>(subgraph_op.get());
CHECK(op);
auto block_desc = op->GetSubBlock();
CHECK_EQ(subgraph_node->AsStmt().op()->Type(), "subgraph");
auto subgraph_op =
dynamic_cast<operators::SubgraphOp*>(subgraph_node->AsStmt().op().get());
CHECK(subgraph_op);
auto sub_program_desc = subgraph_op->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx = subgraph_op->op_info()->GetAttr<int32_t>("sub_block");
auto* sub_block_desc = const_cast<cpp::BlockDesc*>(
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx));
// create a new block desc to keep op sequence correct
cpp::BlockDesc* new_block_desc = new cpp::BlockDesc();
new_block_desc->ClearOps();
new_block_desc->ClearVars();
new_block_desc->SetIdx(block_desc->Idx());
new_block_desc->SetParentIdx(block_desc->ParentIdx());
new_block_desc->SetForwardBlockIdx(block_desc->ForwardBlockIdx());
cpp::BlockDesc new_block_desc;
new_block_desc.ClearOps();
new_block_desc.ClearVars();
new_block_desc.SetIdx(sub_block_desc->Idx());
new_block_desc.SetParentIdx(sub_block_desc->ParentIdx());
new_block_desc.SetForwardBlockIdx(sub_block_desc->ForwardBlockIdx());
// Find all I/O vars that are not weights or persistent
std::list<std::string> i_names, o_names;
......@@ -769,8 +792,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
auto input_name = input->AsArg().name;
if (!(input->AsArg().is_weight || input->AsArg().is_persist)) {
i_names.emplace_back(input_name);
auto ret = CheckInputAndInsert(op->scope(),
new_block_desc,
auto ret = CheckInputAndInsert(subgraph_op->scope(),
&new_block_desc,
input_name,
input->AsArg().type,
subgraph_type);
......@@ -783,8 +806,8 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
auto output_name = output->AsArg().name;
if (!(output->AsArg().is_weight || output->AsArg().is_persist)) {
o_names.emplace_back(output_name);
auto ret = CheckOutputAndInsert(op->scope(),
block_desc,
auto ret = CheckOutputAndInsert(subgraph_op->scope(),
sub_block_desc,
output_name,
output->AsArg().type,
subgraph_type);
......@@ -795,46 +818,48 @@ void MLUPostprocessPass::AdjustSubgraph(Node* subgraph_node,
}
// update input and output
for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); ++op_idx) {
auto desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
auto new_desc = new_block_desc->AddOp<cpp::OpDesc>();
*new_desc = *desc;
if (desc->Type() != "layout" && desc->Type() != "cast") {
auto op_input_args = new_desc->InputArgumentNames();
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
++sub_op_idx) {
auto sub_op_desc = sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx);
auto new_op_desc = new_block_desc.AddOp<cpp::OpDesc>();
*new_op_desc = *sub_op_desc;
if (sub_op_desc->Type() != "layout" && sub_op_desc->Type() != "cast") {
auto op_input_args = new_op_desc->InputArgumentNames();
for (auto& input_arg : op_input_args) {
auto op_input = new_desc->Input(input_arg);
auto op_input = new_op_desc->Input(input_arg);
for (auto& it : i_names) {
auto index = std::find(op_input.begin(), op_input.end(), it);
if (index != op_input.end() &&
node_replace.find(it) != node_replace.end()) {
index = op_input.erase(index);
op_input.emplace(index, node_replace.at(it));
VLOG(4) << new_desc->Type() << "] change input from " << it
VLOG(4) << new_op_desc->Type() << "] change input from " << it
<< " to " << node_replace.at(it);
}
}
new_desc->SetInput(input_arg, op_input);
new_op_desc->SetInput(input_arg, op_input);
}
auto op_output_args = new_desc->OutputArgumentNames();
auto op_output_args = new_op_desc->OutputArgumentNames();
for (auto& output_arg : op_output_args) {
auto op_output = new_desc->Output(output_arg);
auto op_output = new_op_desc->Output(output_arg);
for (auto& it : o_names) {
auto index = std::find(op_output.begin(), op_output.end(), it);
if (index != op_output.end() &&
node_replace.find(it) != node_replace.end()) {
index = op_output.erase(index);
op_output.emplace(index, node_replace.at(it));
VLOG(4) << new_desc->Type() << "] change output from " << it
VLOG(4) << new_op_desc->Type() << "] change output from " << it
<< " to " << node_replace.at(it);
}
}
new_desc->SetOutput(output_arg, op_output);
new_op_desc->SetOutput(output_arg, op_output);
}
}
}
op->SetSubBlock(new_block_desc);
*sub_block_desc = new_block_desc;
}
void ModifyValidPlaces(SSAGraph* graph, bool use_mlu_cast) {
......
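The recurring refactor in this pass replaces the direct `GetSubBlock()` call with an indirection: fetch the subgraph op's whole program desc, read the `sub_block` attribute, and look the block up by index. A simplified model of that lookup, with toy `ProgramDesc`/`BlockDesc` structs and a plain attribute map standing in for `cpp::ProgramDesc`, `cpp::BlockDesc`, and the op-info attributes:

```cpp
#include <cassert>
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy stand-ins (assumptions) for cpp::BlockDesc / cpp::ProgramDesc.
struct BlockDesc { std::vector<std::string> op_types; };
struct ProgramDesc { std::vector<BlockDesc> blocks; };

int main() {
  ProgramDesc program;
  program.blocks.push_back({{"feed", "subgraph", "fetch"}});  // block 0
  program.blocks.push_back({{"conv2d", "relu"}});             // block 1
  // The subgraph op stores the index of its block as an attribute.
  std::map<std::string, int> subgraph_op_attrs = {{"sub_block", 1}};

  int sub_block_idx = subgraph_op_attrs.at("sub_block");
  BlockDesc* sub_block = &program.blocks[sub_block_idx];
  assert(sub_block->op_types.size() == 2);
  for (const auto& op_type : sub_block->op_types)
    std::cout << op_type << "\n";
}
```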
......@@ -153,60 +153,61 @@ Node *SSAGraph::GraphCreateInstructNode(
}
void SSAGraph::Build(const Program &program,
const std::vector<Place> &valid_places) {
const std::vector<Place> &valid_places,
int block_idx) {
CHECK(node_storage_.empty());
auto weights_name = program.weights();
auto is_weights = [&](const std::string &name) -> bool {
auto it = std::find(weights_name.begin(), weights_name.end(), name);
if (it == weights_name.end()) return false;
auto weights = program.weights();
auto is_weight = [&](const std::string &name) -> bool {
auto it = std::find(weights.begin(), weights.end(), name);
if (it == weights.end()) return false;
return true;
};
std::map<std::string, PrecisionType> var_types = program.var_data_type();
std::map<std::string, mir::Node *> arg_update_node_map_;
for (auto &op : program.ops()) {
auto var_type_map = program.var_type_map();
std::map<std::string, mir::Node *> arg_update_node_map;
for (auto &op : program.ops(block_idx)) {
VLOG(3) << op->op_info()->Type();
auto *op_node = GraphCreateInstructNode(op, valid_places);
for (const std::string &name : op->op_info()->input_names()) {
auto *op_info = op->op_info();
const auto &op_type = op_info->Type();
for (const auto &var_name : op_info->input_names()) {
mir::Node *arg_node = nullptr;
if (arg_update_node_map_.count(name)) {
arg_node = arg_update_node_map_.at(name);
if (arg_update_node_map.count(var_name)) {
arg_node = arg_update_node_map.at(var_name);
} else {
node_storage_.emplace_back();
arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
}
if (var_types.count(name)) {
if (var_type_map.count(var_name)) {
if (!arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->arg()->type = var_type_map[var_name];
}
// Store the original data type of the output tensors for
// type_precision_cast_pass, to keep the output types consistent between
// the original graph and the optimized graph
if (op->op_info()->Type() == "fetch") {
if (op_type == "fetch") {
op->mutable_op_info()->SetAttr<int>(
"data_type", static_cast<int>(var_types[name]));
"data_type",
static_cast<int>(var_type_map[var_name]->precision()));
}
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(arg_node, op_node);
}
for (const std::string &name : op->op_info()->output_names()) {
for (const auto &var_name : op->op_info()->output_names()) {
node_storage_.emplace_back();
auto *arg_node = &node_storage_.back();
arg_node->AsArg(name, node_storage_.size() - 1);
arg_update_node_map_[name] = arg_node;
if (var_types.count(name) && !arg_node->arg()->type) {
arg_node->arg()->type = LiteType::GetTensorTy(
TARGET(kUnk), var_types[name], DATALAYOUT(kUnk));
arg_node->AsArg(var_name, node_storage_.size() - 1);
arg_update_node_map[var_name] = arg_node;
if (var_type_map.count(var_name) && !arg_node->arg()->type) {
arg_node->arg()->type = var_type_map[var_name];
}
if (is_weights(name)) arg_node->AsArg().is_weight = true;
if (is_weight(var_name)) arg_node->AsArg().is_weight = true;
CHECK(arg_node->IsRoleSet());
DirectedLink(op_node, arg_node);
}
......
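`arg_update_node_map` is what makes the graph SSA-like: every later read of a variable links to the node created by its most recent write, and every write creates a fresh node. A toy version of that bookkeeping (the `Op` struct is a stand-in for `OpLite` plus its op info):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

// Toy op: reads `inputs`, writes `outputs`.
struct Op {
  std::string type;
  std::vector<std::string> inputs, outputs;
};

int main() {
  std::vector<Op> ops = {{"conv2d", {"x"}, {"y"}},
                         {"relu", {"y"}, {"y"}},  // writes y again
                         {"fetch", {"y"}, {}}};
  std::map<std::string, int> arg_update_node_map;  // var -> latest node id
  int next_node_id = 0;
  for (const auto& op : ops) {
    for (const auto& in : op.inputs) {
      if (!arg_update_node_map.count(in))  // first sight: create an arg node
        arg_update_node_map[in] = next_node_id++;
      std::cout << op.type << " reads " << in << " (node "
                << arg_update_node_map[in] << ")\n";
    }
    for (const auto& out : op.outputs)
      arg_update_node_map[out] = next_node_id++;  // every write: fresh node
  }
  // fetch reads the node written by relu, not the one written by conv2d.
}
```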
......@@ -35,9 +35,13 @@ class GraphBase {};
class SSAGraph : GraphBase {
public:
// @param program: the op program
// @param program: the target program with vars and ops
// @param valid_places: the valid places user set for the system.
void Build(const Program &program, const std::vector<Place> &valid_places);
// @param block_idx: the block index in the target program, default is 0 (the
// main block)
void Build(const Program &program,
const std::vector<Place> &valid_places,
int block_idx = kRootBlockIdx);
void RemoveNode(const mir::Node *node);
std::vector<mir::Node *> StmtTopologicalOrder();
......
......@@ -411,16 +411,17 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
cpp::OpDesc subgraph_op_desc;
subgraph_op_desc.SetType("subgraph");
// Create a new sub block desc for storing all of the ops and vars of the
// target subgraph, and sub_block_idx is set as an attribute of the subgraph
// op; sub_block_idx < 0 means it's a new subgraph op
int sub_block_idx = -(subgraph_idx + 1);
auto sub_block_desc = new cpp::BlockDesc();
// Create a program desc and a block desc for storing all of the ops and vars
// of the target subgraph, and sub_block_idx is set as an attribute of the
// subgraph op; sub_block_idx = 0 means it's a new subgraph op
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
for (auto &op_node : subgraph_nodes) {
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_node->AsStmt().op_info();
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_node->AsStmt().op_info();
}
subgraph_op_desc.SetAttr<int32_t>("sub_block", sub_block_idx);
......@@ -437,13 +438,13 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
&local_var_nodes,
&unused_var_nodes);
// A simplified model without the original weight/local/unused nodes on the
// subgraph ops will be saved only if 'SUBGRAPH_DISABLE_ONLINE_MODE' is set to
// true and Predictor->Run(...), Predictor->Save(...) is called.
// subgraph ops will be saved only if 'SUBGRAPH_ONLINE_MODE' is set to
// true (default) and Predictor->Run(...), Predictor->Save(...) is called.
std::set<Node *> input_var_nodes(idata_var_nodes.begin(),
idata_var_nodes.end());
std::set<Node *> output_var_nodes(odata_var_nodes.begin(),
odata_var_nodes.end());
if (!GetBoolFromEnv(SUBGRAPH_DISABLE_ONLINE_MODE)) {
if (GetBoolFromEnv(SUBGRAPH_ONLINE_MODE, true)) {
input_var_nodes.insert(weight_var_nodes.begin(), weight_var_nodes.end());
output_var_nodes.insert(local_var_nodes.begin(), local_var_nodes.end());
output_var_nodes.insert(unused_var_nodes.begin(), unused_var_nodes.end());
......@@ -476,7 +477,7 @@ void SubgraphFuser::InsertNewNode(SSAGraph *graph,
subgraph_op_desc.SetOutput("Outputs", output_var_names);
auto subgraph_op = LiteOpRegistry::Global().Create("subgraph");
static_cast<operators::SubgraphOp *>(subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto any_op = (*subgraph_nodes.begin())->AsStmt().op();
subgraph_op->Attach(subgraph_op_desc, any_op->scope());
......
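The flag above flipped from an opt-in `SUBGRAPH_DISABLE_ONLINE_MODE` to an on-by-default `SUBGRAPH_ONLINE_MODE`, read with a default value. A sketch of a `GetBoolFromEnv`-style helper with a default; this is a hypothetical re-implementation modeled on the call site, not the real utility in Paddle Lite:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Hypothetical re-implementation of GetBoolFromEnv(name, default).
bool GetBoolFromEnv(const std::string& name, bool default_value = false) {
  const char* raw = std::getenv(name.c_str());
  if (raw == nullptr) return default_value;  // unset -> use the default
  std::string value(raw);
  return value == "1" || value == "true" || value == "ON";
}

int main() {
  // Unset: online mode is on by default, so weight/local/unused vars are
  // kept on the subgraph op; exporting SUBGRAPH_ONLINE_MODE=0 disables it.
  std::cout << std::boolalpha
            << GetBoolFromEnv("SUBGRAPH_ONLINE_MODE", true) << "\n";
}
```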
......@@ -141,12 +141,11 @@ std::vector<std::string> AddFetchDesc(
}
TEST(Subgraph, detect_simple_model) {
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
std::vector<Place> valid_places{{TARGET(kHost), PRECISION(kFloat)}};
auto scope = std::make_shared<Scope>();
// Build a simple network
program_desc.ClearBlocks();
auto* block_desc = program_desc.AddBlock<cpp::BlockDesc>();
auto* block_desc = program_desc->AddBlock<cpp::BlockDesc>();
block_desc->ClearOps();
block_desc->ClearVars();
auto* var_desc = block_desc->AddVar<cpp::VarDesc>();
......@@ -181,13 +180,13 @@ TEST(Subgraph, detect_custom_model) {
"the path of model files.";
return;
}
cpp::ProgramDesc program_desc;
auto program_desc = std::make_shared<cpp::ProgramDesc>();
auto scope = std::make_shared<Scope>();
LoadModelPb(FLAGS_model_dir,
FLAGS_model_file,
FLAGS_params_file,
scope.get(),
&program_desc,
program_desc.get(),
!FLAGS_model_file.empty() && !FLAGS_params_file.empty(),
false);
std::vector<Place> valid_places({
......@@ -200,6 +199,9 @@ TEST(Subgraph, detect_custom_model) {
#ifdef LITE_WITH_NPU
Place{TARGET(kNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
#endif
#ifdef LITE_WITH_XTCL
Place{TARGET(kXPU), PRECISION(kFloat)},
#endif
......
......@@ -40,6 +40,21 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser();
}
void HuaweiAscendNPUSubgraphPass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/huawei_ascend_npu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
......@@ -119,6 +134,9 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(huawei_ascend_npu_subgraph_pass,
paddle::lite::mir::HuaweiAscendNPUSubgraphPass)
.BindTargets({TARGET(kHuaweiAscendNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
......
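The `USE_SUBGRAPH_BRIDGE` block in the new pass is the usual X-macro trick: the bridges header is just a list of `USE_SUBGRAPH_BRIDGE(op, target)` lines, and each pass redefines the macro to harvest the op names it supports. A self-contained sketch of the mechanism; the three bridge lines below are inlined and invented for illustration (the real list lives in `paddle_use_bridges.h`):

```cpp
#include <iostream>
#include <set>
#include <string>

// Each expansion inserts the stringized op name into the set in scope.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);

int main() {
  std::set<std::string> supported_lists;
  USE_SUBGRAPH_BRIDGE(conv2d, kHuaweiAscendNPU)
  USE_SUBGRAPH_BRIDGE(pool2d, kHuaweiAscendNPU)
  USE_SUBGRAPH_BRIDGE(softmax, kHuaweiAscendNPU)
#undef USE_SUBGRAPH_BRIDGE

  // The teller in the pass then just checks membership:
  std::cout << supported_lists.count("conv2d") << "\n";  // 1 -> fusable
  std::cout << supported_lists.count("while") << "\n";   // 0 -> stays on host
}
```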
......@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class HuaweiAscendNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
......@@ -187,6 +187,10 @@ TEST(Subgraph, generate_model_and_check_precision) {
#ifdef LITE_WITH_NPU
valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
valid_places.push_back(
lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)});
#endif
#ifdef LITE_WITH_XTCL
valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)});
#endif
......
......@@ -36,14 +36,20 @@ void UpdateInputsForSubgraph(OpLite* op,
op_desc->GetAttr<std::vector<std::string>>("input_data_names");
std::replace(input_data_names.begin(), input_data_names.end(), from, to);
op_desc->SetAttr("input_data_names", input_data_names);
auto* subblock_desc = static_cast<operators::SubgraphOp*>(op)->GetSubBlock();
CHECK(subblock_desc);
for (size_t i = 0; i < subblock_desc->OpsSize(); i++) {
auto* subblock_op_desc = subblock_desc->GetOp<cpp::OpDesc>(i);
for (auto& subblock_op_input : *subblock_op_desc->mutable_inputs()) {
for (auto& subblock_var_name : subblock_op_input.second) {
if (subblock_var_name == from) {
subblock_var_name = to;
auto sub_program_desc =
static_cast<operators::SubgraphOp*>(op)->GetProgramDesc();
CHECK(sub_program_desc);
int sub_block_idx = op_desc->GetAttr<int32_t>("sub_block");
auto sub_block_desc =
sub_program_desc->GetBlock<cpp::BlockDesc>(sub_block_idx);
for (size_t sub_op_idx = 0; sub_op_idx < sub_block_desc->OpsSize();
sub_op_idx++) {
auto sub_op_desc = const_cast<cpp::OpDesc*>(
sub_block_desc->GetOp<cpp::OpDesc>(sub_op_idx));
for (auto& sub_op_input : *sub_op_desc->mutable_inputs()) {
for (auto& sub_var_name : sub_op_input.second) {
if (sub_var_name == from) {
sub_var_name = to;
}
}
}
......
......@@ -59,25 +59,46 @@ class VariablePlaceInferencePass : public DebugPass {
}
// Set the type of the weight
void SetWeightType(Node* w,
void SetWeightType(Node* weight_node,
const LiteType& type,
const std::map<std::string, bool>& lite_with_targets) {
const std::map<std::string, bool>& with_targets) {
VLOG(4) << "type.precision():" << PrecisionRepr(type.precision());
if (lite_with_targets.at("kFPGA")) {
w->AsArg().type = LiteType::GetTensorTy(
if (with_targets.at("kFPGA")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (lite_with_targets.at("kOpenCL")) {
w->AsArg().type = LiteType::GetTensorTy(
} else if (with_targets.at("kOpenCL")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (lite_with_targets.at("kCUDA")) {
w->AsArg().type = LiteType::GetTensorTy(
} else if (with_targets.at("kCUDA")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else {
w->AsArg().type = LiteType::GetTensorTy(
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
}
}
// Update a's kUnk fields from b's fields.
void UpdateTypeFrom(const Type** a, const Type* b) {
auto target = (*a)->target();
auto precision = (*a)->precision();
auto layout = (*a)->layout();
if (target == TARGET(kUnk)) {
target = b->target();
}
if (precision == PRECISION(kUnk)) {
precision = b->precision();
}
if (layout == DATALAYOUT(kUnk)) {
layout = b->layout();
}
if ((*a)->IsTensor() && b->IsTensor()) {
*a = LiteType::GetTensorTy(target, precision, layout);
} else if ((*a)->IsTensorList() && b->IsTensorList()) {
*a = LiteType::GetTensorListTy(target, precision, layout);
}
}
void InferenceArgumentPlace(SSAGraph* graph) {
auto& valid_places = graph->valid_places();
auto valid_places_has_target = [&](TargetType t) -> bool {
......@@ -88,122 +109,90 @@ class VariablePlaceInferencePass : public DebugPass {
}
return false;
};
std::map<std::string, bool> lite_with_targets{
std::map<std::string, bool> with_targets{
{"kOpenCL", valid_places_has_target(TARGET(kOpenCL))},
{"kCUDA", valid_places_has_target(TARGET(kCUDA))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))}};
VLOG(4) << "lite_with_targets['kOpenCL']:" << lite_with_targets["kOpenCL"];
VLOG(4) << "lite_with_targets['kFPGA']:" << lite_with_targets["kFPGA"];
VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"];
VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"];
VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& x : graph->StmtTopologicalOrder()) {
auto& inst = x->AsStmt();
for (auto& node : graph->StmtTopologicalOrder()) {
auto& inst = node->AsStmt();
const auto* op_info = inst.op_info();
const auto& op_type = op_info->Type();
auto& kernel = inst.picked_kernel();
// The IoCopyOp is a tool operator and does not support type inference.
// On FPGA we have io_copy + cali + layout tool ops, so we need type
// inference for tool operators
if ((!lite_with_targets["kFPGA"]) && (!lite_with_targets["kOpenCL"])) {
VLOG(3) << "inst.op_type() == 'io_copy', continue";
if (inst.op_type() == "io_copy") continue;
// for tool operators
if ((!with_targets["kFPGA"]) && (!with_targets["kOpenCL"])) {
VLOG(3) << "skip 'io_copy' unless the target is FPGA or OpenCL";
if (op_type == "io_copy") continue;
}
// deal with inputs
VLOG(4) << "Infering op " << inst.op_info()->Repr();
// TODO(zhaolong): Add check if the node's name in op's arguments.
auto get_argname = [&](
const std::string& node_name,
const std::map<std::string, std::vector<std::string>>& argname_map)
-> std::string {
for (auto& ele : argname_map) {
auto it =
std::find(ele.second.begin(), ele.second.end(), node_name);
if (it != ele.second.end()) return ele.first;
}
return "";
};
for (auto* x_in : x->inlinks) {
std::string node_name = x_in->AsArg().name;
std::string arg_name = get_argname(node_name, inst.op_info()->inputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
<< node_name;
VLOG(4) << "-- input arg_name:" << arg_name << " "
<< "-- node name:" << node_name;
auto type = inst.picked_kernel().GetInputDeclType(arg_name);
if (!x_in->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_in->AsArg().name;
if (x_in->AsArg().is_weight) {
SetWeightType(x_in, *type, lite_with_targets);
// Inferring the places of the input and output variables according to the
// declaration of the I/O arguments of the op's picked kernel
VLOG(4) << "Op " << op_info->Repr();
for (auto* in_node : node->inlinks) {
auto& var = in_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetInputArgname(var_name, &arg_name))
<< "Cannot find the input argument for var " << var_name;
VLOG(4) << " - input arg name:" << arg_name << " var name:" << var_name;
const auto* decl_type = kernel.GetInputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(in_node, *decl_type, with_targets);
} else {
x_in->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_in->AsArg().type->target() == TARGET(kUnk) &&
x_in->AsArg().type->precision() != PRECISION(kUnk) &&
x_in->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_in->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8)) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_in->AsArg().type->precision();
x_in->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
VLOG(4) << "inst " << inst.op_info()->Repr();
for (auto* x_out : x->outlinks) {
std::string node_name = x_out->AsArg().name;
std::string arg_name =
get_argname(node_name, inst.op_info()->outputs());
CHECK(arg_name.size() > 0) << "cannot find op arguments for node "
<< node_name << " in Inst "
<< inst.op_type();
VLOG(4) << "-- output arg_name " << arg_name;
auto type = inst.picked_kernel().GetOutputDeclType(arg_name);
if (!x_out->AsArg().type) {
VLOG(4) << "set type " << *type << " " << x_out->AsArg().name;
if (x_out->AsArg().is_weight) {
SetWeightType(x_out, *type, lite_with_targets);
for (auto* out_node : node->outlinks) {
auto& var = out_node->AsArg();
const auto& var_name = var.name;
auto* var_type = &var.type;
std::string arg_name;
CHECK(op_info->GetOutputArgname(var_name, &arg_name))
<< "Cannot find the output argument for var " << var_name;
VLOG(4) << " - output arg name:" << arg_name
<< " var name:" << var_name;
const auto* decl_type = kernel.GetOutputDeclType(arg_name);
if (!(*var_type)) {
VLOG(4) << "set type " << *decl_type << " " << var_name;
if (var.is_weight) {
SetWeightType(out_node, *decl_type, with_targets);
} else {
x_out->AsArg().type = type;
*var_type = decl_type;
}
} else if (x_out->AsArg().type->target() == TARGET(kUnk) &&
x_out->AsArg().type->precision() != PRECISION(kUnk) &&
x_out->AsArg().type->layout() == DATALAYOUT(kUnk)) {
} else if (!(*var_type)->place().is_valid()) {
// If is quantization, infer the Int8 type.
if (type->precision() == PRECISION(kInt8)) {
x_out->AsArg().type = type;
} else if (type->precision() == PRECISION(kFP16) &&
type->target() != TARGET(kOpenCL)) {
x_out->AsArg().type = type;
if (decl_type->precision() == PRECISION(kInt8) ||
(decl_type->precision() == PRECISION(kFP16) &&
decl_type->target() != TARGET(kOpenCL))) {
*var_type = decl_type;
} else {
PrecisionType tmp_ptype = x_out->AsArg().type->precision();
x_out->AsArg().type = LiteType::GetTensorTy(
type->target(), tmp_ptype, type->layout());
UpdateTypeFrom(var_type, decl_type);
}
}
}
}
}
// Update me's kUnk fields by other's fields.
void UpdatePlace(Place* me, const Place& other) {
CHECK(other.is_valid());
if (me->target == TARGET(kUnk)) {
me->target = other.target;
}
if (me->precision == PRECISION(kUnk)) {
me->precision = other.precision;
}
if (me->layout == DATALAYOUT(kUnk)) {
me->layout = other.layout;
}
}
private:
// The default target for arguments, e.g. load weights to CPU memory for CUDA
// computation by default.
// The default target for arguments, e.g. load weights to CPU memory for
// CUDA computation by default.
TargetType argument_default_target_{TARGET(kHost)};
};
......
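`UpdateTypeFrom` above fills only the unknown (`kUnk`) fields of a type from the kernel's declared type and leaves the known fields alone, which is why a quantized `kInt8` precision survives while target and layout are filled in. Here is that merge rule on a toy type struct; the enums are simplified stand-ins, and the real code additionally distinguishes tensors from tensor lists:

```cpp
#include <iostream>

enum class Target { kUnk, kHost, kARM };
enum class Precision { kUnk, kFloat, kInt8 };
enum class Layout { kUnk, kNCHW };

struct Ty { Target target; Precision precision; Layout layout; };

// Fill a's kUnk fields from b; keep everything a already knows.
void UpdateTypeFrom(Ty* a, const Ty& b) {
  if (a->target == Target::kUnk) a->target = b.target;
  if (a->precision == Precision::kUnk) a->precision = b.precision;
  if (a->layout == Layout::kUnk) a->layout = b.layout;
}

int main() {
  Ty var{Target::kUnk, Precision::kInt8, Layout::kUnk};     // partly known
  Ty decl{Target::kARM, Precision::kFloat, Layout::kNCHW};  // kernel decl
  UpdateTypeFrom(&var, decl);
  // Target and layout come from the declaration; the known kInt8 survives.
  std::cout << (var.precision == Precision::kInt8) << "\n";  // 1
  std::cout << (var.target == Target::kARM) << "\n";         // 1
}
```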
......@@ -99,7 +99,7 @@ class OpLite : public Registry {
std::vector<std::unique_ptr<KernelBase>> CreateKernels(
const std::vector<Place> &places, const std::string &kernel_type = "");
lite::Scope *scope() { return scope_; }
Scope *scope() { return scope_; }
// Assign op param to kernel.
virtual void AttachKernel(KernelBase *kernel) = 0;
......@@ -169,7 +169,7 @@ class OpLite : public Registry {
}
protected:
lite::Scope *scope_{nullptr};
Scope *scope_{nullptr};
std::unique_ptr<KernelBase> kernel_;
std::string op_type_;
std::vector<Place> valid_places_;
......
......@@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include "lite/core/mir/generate_program_pass.h"
#include "lite/core/mir/pass_manager.h"
#include "lite/core/mir/pass_utils.h"
......@@ -36,6 +37,9 @@ namespace lite {
* lite::Optimizer optimizes a program. It utilizes the MIR passes to analyze
* the program and exports an optimized program.
*/
// TODO(hong1986032) Support the following passes for the subblocks
const std::set<std::string> kSubblockUnsupportedPasses(
{"memory_optimize_pass"});
class Optimizer {
public:
Optimizer() {}
......@@ -60,14 +64,20 @@ class Optimizer {
program_ = &program;
valid_places_ = valid_places;
CHECK(!valid_places.empty()) << "At least one valid_place should be set";
CHECK(!graph_) << "duplicate optimize found";
graph_.reset(new mir::SSAGraph);
graph_->Build(program, valid_places);
graph_->SetValidPlaces(valid_places);
CHECK(graphs_.empty()) << "duplicate optimize found";
auto block_size = program.block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
std::unique_ptr<mir::SSAGraph> graph;
graph.reset(new mir::SSAGraph);
graph->Build(program, valid_places, block_idx);
graph->SetValidPlaces(valid_places);
graphs_.emplace_back(std::move(graph));
}
SpecifyKernelPickTactic(kernel_pick_factor);
InitTargetTypeTransformPass();
InitControlFlowOpUnusedInputsAndOutputsEliminatePass();
if (passes.empty() || passes.size() == 1) {
std::vector<std::string> passes_local{
......@@ -107,11 +117,13 @@ class Optimizer {
// 'enable_int8' for all
// of the quantized ops.
"npu_subgraph_pass",
"huawei_ascend_npu_subgraph_pass",
"xpu_subgraph_pass",
"bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass",
"mlu_subgraph_pass",
"control_flow_op_unused_inputs_and_outputs_eliminate_pass",
"static_kernel_pick_pass", // pick original kernel from graph
"remove_tf_redundant_ops_pass",
......@@ -176,62 +188,15 @@ class Optimizer {
exec_scope_ = program.exec_scope();
}
const lite::Scope* exec_scope() const { return exec_scope_; }
// Set the shape (dims) info of var descs on the scope vars, so that
// developers can write passes using the input/output tensor dims of ops.
//
// Example: If you have node `Node* softmax_node`,
// you can get dims of output tensor in passes:
//
// auto* scope = softmax_node->AsStmt().op()->scope();
// auto softmax_out_arg_name =
// softmax_node->outlinks.front()->AsArg().name;
// auto softmax_out_tensor =
// scope->FindVar(softmax_out_arg_name)->Get<lite::Tensor>();
// softmax_out_dims = softmax_out_tensor.dims();
void SetVarDescShapeToScopeVar() {
auto dims_to_str_func = [](std::vector<int64_t> shape) -> std::string {
std::string str_res;
for (size_t i = 0; i < shape.size(); ++i) {
str_res += std::to_string(shape[i]);
if (i != shape.size() - 1) {
str_res += "x";
}
}
return str_res;
};
auto* program_desc = program_->program_desc();
VLOG(5) << "program_desc->BlocksSize():" << program_desc->BlocksSize();
auto blocks_desc = program_desc->GetBlocks();
for (size_t bidx = 0; bidx < blocks_desc.size(); ++bidx) {
auto block_desc = blocks_desc[bidx];
auto vars_desc = block_desc.GetVars();
for (size_t vidx = 0; vidx < vars_desc.size(); ++vidx) {
auto var_desc = vars_desc[vidx];
VLOG(5) << var_desc.Name() << " "
<< dims_to_str_func(var_desc.GetShape());
if (var_desc.Name() == "feed" || var_desc.Name() == "fetch") continue;
auto* var = program_->exec_scope()->FindVar(var_desc.Name());
auto tensor = var->GetMutable<lite::Tensor>();
if (tensor->dims().size() == 0 && var_desc.GetShape().size() != 0) {
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape());
tensor->Resize(var_desc.GetShape());
}
VLOG(5) << "var_desc.Name():" << var_desc.Name()
<< " shape:" << dims_to_str_func(var_desc.GetShape())
<< " tensor:" << tensor->dims();
}
}
}
const Scope* exec_scope() const { return exec_scope_; }
// Generate a new program based on the mir graph.
std::unique_ptr<RuntimeProgram> GenRuntimeProgram() {
auto pass = mir::PassManager::Global().LookUp<mir::GenerateProgramPass>(
"generate_program_pass");
pass->Apply(graph_);
for (auto& graph : graphs_) {
pass->Apply(graph);
}
auto program = pass->GenProgram();
CHECK(exec_scope_);
program->set_exec_scope(exec_scope_);
......@@ -247,27 +212,38 @@ class Optimizer {
pass->SetValidPlaces(valid_places_);
}
void InitControlFlowOpUnusedInputsAndOutputsEliminatePass() {
auto* pass =
mir::PassManager::Global()
.LookUp<mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass>(
"control_flow_op_unused_inputs_and_outputs_eliminate_pass");
CHECK(pass);
CHECK(!graphs_.empty());
pass->SetAllGraphs(&graphs_);
}
// Generate C++ code which combines the inference program, model and weights.
void GenCode(const std::string& code_dir);
const mir::SSAGraph& ssa_graph() const {
CHECK(graph_);
return *graph_;
const mir::SSAGraph& ssa_graph(int block_idx = kRootBlockIdx) const {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return *graphs_[block_idx];
}
mir::SSAGraph* mutable_ssa_graph() {
CHECK(graph_);
return graph_.get();
mir::SSAGraph* mutable_ssa_graph(int block_idx = kRootBlockIdx) {
CHECK(!graphs_.empty());
CHECK(graphs_[block_idx]);
return graphs_[block_idx].get();
}
lite::Scope* exec_scope() { return exec_scope_; }
Scope* exec_scope() { return exec_scope_; }
protected:
void SpecifyKernelPickTactic(core::KernelPickFactor factor);
// Specify the passes and run them.
void RunPasses(const std::vector<std::string>& passes) {
SetVarDescShapeToScopeVar();
for (auto& x : passes) {
LOG(INFO) << "== Running pass: " << x;
mir::Pass* pass = mir::PassManager::Global().LookUp(x);
......@@ -285,16 +261,23 @@ class Optimizer {
LOG(INFO) << " - Skip " << x
<< " because the target or kernel does not match.";
} else {
pass->Apply(graph_);
// Check whether the pass supports processing sub-blocks
if (kSubblockUnsupportedPasses.count(x)) {
pass->Apply(graphs_[kRootBlockIdx]);
} else {
for (auto& graph : graphs_) {
pass->Apply(graph);
}
}
LOG(INFO) << "== Finished running: " << x;
}
}
}
private:
std::unique_ptr<mir::SSAGraph> graph_;
std::vector<std::unique_ptr<mir::SSAGraph>> graphs_;
std::vector<Place> valid_places_;
lite::Scope* exec_scope_{};
Scope* exec_scope_{};
Program* program_{};
};
......
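With one SSA graph per block, a pass either runs on every graph or, if listed in `kSubblockUnsupportedPasses`, only on the root graph. A toy driver for that dispatch; the names mirror the header above, but `Graph` and `Apply` are stand-ins for `mir::SSAGraph` and `mir::Pass::Apply`:

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Graph { int block_idx; };  // stand-in for mir::SSAGraph

constexpr int kRootBlockIdx = 0;
const std::set<std::string> kSubblockUnsupportedPasses{"memory_optimize_pass"};

void Apply(const std::string& pass, Graph& g) {
  std::cout << pass << " on block " << g.block_idx << "\n";
}

int main() {
  std::vector<Graph> graphs = {{0}, {1}, {2}};  // root + two sub-blocks
  for (const std::string pass :
       {std::string("static_kernel_pick_pass"),
        std::string("memory_optimize_pass")}) {
    if (kSubblockUnsupportedPasses.count(pass)) {
      Apply(pass, graphs[kRootBlockIdx]);  // root block only
    } else {
      for (auto& g : graphs) Apply(pass, g);  // all blocks
    }
  }
}
```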
(This diff is collapsed.)
......@@ -41,61 +41,72 @@ static const char kKernelTypeAttr[] = "__@kernel_type_attr@__";
// - scope: which contains all the weights
struct Program {
public:
explicit Program(const std::shared_ptr<Scope>& root) { scope_ = root; }
Program(const cpp::ProgramDesc& desc,
const std::shared_ptr<Scope>& root,
explicit Program(const std::shared_ptr<Scope>& root_scope) {
scope_ = root_scope;
}
Program(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root_scope,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: scope_(root), valid_places_(valid_places) {
desc_.CopyFrom(desc);
const std::vector<std::string>& vars_to_clone = {})
: scope_(root_scope),
valid_places_(valid_places),
program_desc_(program_desc) {
CHECK(scope_) << "scope should be initialized first";
VLOG(4) << "prepare work";
PrepareWorkspace(desc, var_names);
PrepareWorkspace(program_desc_, vars_to_clone);
VLOG(4) << "build desc";
Build(desc);
Build(program_desc_);
VLOG(4) << "build desc finished";
}
std::unique_ptr<Program> Clone() const {
std::unique_ptr<Program> res(new Program(desc_, scope_, valid_places_));
return res;
return std::unique_ptr<Program>(
new Program(program_desc_, scope_, valid_places_));
}
const std::list<std::string>& weights() const { return weights_; }
const std::list<std::string>& tmp_vars() const { return tmp_vars_; }
const std::list<std::string>& vars() const { return vars_; }
std::list<std::string>* mutable_weights() { return &weights_; }
std::list<std::string>* mutable_tmp_vars() { return &tmp_vars_; }
std::list<std::string>* mutable_vars() { return &vars_; }
const std::list<std::shared_ptr<OpLite>>& ops() const { return ops_; }
std::list<std::shared_ptr<OpLite>>* mutable_ops() { return &ops_; }
const std::list<std::shared_ptr<OpLite>>& ops(
int block_idx = kRootBlockIdx) const {
return ops_[block_idx];
}
std::list<std::shared_ptr<OpLite>>* mutable_ops(
int block_idx = kRootBlockIdx) {
return &ops_[block_idx];
}
lite::Scope* exec_scope() { return exec_scope_; }
lite::Scope* scope() { return scope_.get(); }
size_t block_size() { return ops_.size(); }
cpp::ProgramDesc* program_desc() { return &desc_; }
Scope* exec_scope() { return exec_scope_; }
Scope* scope() { return scope_.get(); }
const std::map<std::string, PrecisionType>& var_data_type() const {
return var_data_type_;
cpp::ProgramDesc* program_desc() { return program_desc_.get(); }
const std::map<std::string, const Type*>& var_type_map() const {
return var_type_map_;
}
private:
// Build from a program and scope.
void Build(const cpp::ProgramDesc& program);
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc);
// Create temporary variables.
void PrepareWorkspace(const cpp::ProgramDesc& program,
const std::vector<std::string>& var_names = {});
void PrepareWorkspace(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<std::string>& vars_to_clone = {});
private:
std::map<std::string, PrecisionType> var_data_type_;
std::list<std::string> tmp_vars_;
std::map<std::string, const Type*> var_type_map_;
std::list<std::string> vars_;
std::list<std::string> weights_;
std::list<std::shared_ptr<OpLite>> ops_;
std::vector<std::list<std::shared_ptr<OpLite>>> ops_;
// the scope to run the kernels, NOTE this is the execution scope.
std::shared_ptr<lite::Scope> scope_;
std::shared_ptr<Scope> scope_;
std::vector<Place> valid_places_;
// Runtime scope.
lite::Scope* exec_scope_{};
cpp::ProgramDesc desc_;
Scope* exec_scope_{};
std::shared_ptr<cpp::ProgramDesc> program_desc_;
};
struct Instruction {
......@@ -173,8 +184,22 @@ struct Instruction {
*/
class LITE_API RuntimeProgram {
public:
explicit RuntimeProgram(std::vector<Instruction>&& insts)
explicit RuntimeProgram(std::vector<std::vector<Instruction>>&& insts)
: instructions_(std::move(insts)) {
Init();
}
explicit RuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc,
Scope* exec_scope,
int block_idx = kRootBlockIdx);
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
#endif // LITE_WITH_PROFILE
}
void Init() {
if (instructions_.empty()) {
LOG(FATAL) << "no instructions";
}
......@@ -183,7 +208,7 @@ class LITE_API RuntimeProgram {
#endif
#ifdef LITE_WITH_NVTX
const NVTXAnnotator& annotator = NVTXAnnotator::Global();
for (auto& inst : instructions_) {
for (auto& inst : instructions_[kRootBlockIdx]) {
NVTXRangeAnnotation annotation = annotator.AnnotateBlock();
register_layer_names_.push_back(annotator.RegisterString(
const_cast<paddle::lite::OpLite*>(inst.op())->Type().c_str()));
......@@ -191,41 +216,38 @@ class LITE_API RuntimeProgram {
register_layer_names_.push_back(annotator.RegisterString("one_loop"));
#endif
}
~RuntimeProgram() {
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kCreate);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch);
#endif // LITE_WITH_PROFILE
}
void Run();
void set_exec_scope(lite::Scope* x) { exec_scope_ = x; }
lite::Scope* exec_scope() { return exec_scope_; }
void set_exec_scope(Scope* x) { exec_scope_ = x; }
Scope* exec_scope() { return exec_scope_; }
size_t num_instructions() const { return instructions_.size(); }
const std::vector<Instruction>& instructions(
int block_idx = kRootBlockIdx) const {
return instructions_[block_idx];
}
const std::vector<Instruction>& instructions() const { return instructions_; }
std::vector<Instruction>* mutable_instructions(
int block_idx = kRootBlockIdx) {
return &instructions_[block_idx];
}
// `SaveOpInfosToProgram` will update the op list(ops_) of the block 0
// in ProgramDesc.
void SaveOpInfosToProgram(cpp::ProgramDesc* desc);
size_t block_size() { return instructions_.size(); }
// `UpdateVarsOfProgram` will update the var list(vars_) of the block 0 in
// ProgramDesc. Namely, if a new var created in some passes, its var_desc will
// be added in vars_.
void UpdateVarsOfProgram(cpp::ProgramDesc* desc);
// Update the ops and vars of all of the blocks in the given program_desc
// according to the instructions
void SaveToProgram(std::shared_ptr<cpp::ProgramDesc> program_desc);
private:
RuntimeProgram(const RuntimeProgram&) = delete;
std::vector<Instruction> instructions_;
lite::Scope* exec_scope_{};
std::vector<std::vector<Instruction>> instructions_;
Scope* exec_scope_{};
#ifdef LITE_WITH_PROFILE
profile::Profiler profiler_;
void set_profiler() {
for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
i->set_profiler(&profiler_);
for (auto& inst : instructions_[kRootBlockIdx]) {
inst.set_profiler(&profiler_);
}
}
#endif
......
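`RuntimeProgram` now stores `std::vector<std::vector<Instruction>>`, and its accessors take a `block_idx` that defaults to `kRootBlockIdx`, so existing single-block call sites compile unchanged while sub-blocks become reachable. A minimal sketch of that API shape with toy types (not the real class):

```cpp
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

constexpr int kRootBlockIdx = 0;
struct Instruction { std::string op_type; };  // toy stand-in

class RuntimeProgram {
 public:
  explicit RuntimeProgram(std::vector<std::vector<Instruction>>&& insts)
      : instructions_(std::move(insts)) {}
  // Default to the root block so old single-block callers are unchanged.
  const std::vector<Instruction>& instructions(
      int block_idx = kRootBlockIdx) const {
    return instructions_[block_idx];
  }
  std::size_t block_size() const { return instructions_.size(); }

 private:
  std::vector<std::vector<Instruction>> instructions_;
};

int main() {
  RuntimeProgram program({{{"feed"}, {"while"}, {"fetch"}},  // block 0
                          {{"conv2d"}, {"relu"}}});          // block 1
  std::cout << program.block_size() << " blocks, root op 1: "
            << program.instructions()[1].op_type << "\n";  // "while"
}
```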
......@@ -84,6 +84,7 @@ void TensorLite::CopyDataFrom(const TensorLite &other) {
lod_ = other.lod_;
memory_size_ = other.memory_size_;
precision_ = other.precision_;
persistable_ = other.persistable_;
buffer_->CopyDataFrom(*other.buffer_, memory_size_);
}
......
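The one-line change above makes `CopyDataFrom` carry the `persistable_` flag along with the other metadata; previously a copied tensor silently lost it. A toy illustration of the failure mode and the fix, using a simplified `TensorLite` stand-in:

```cpp
#include <cassert>
#include <cstddef>

struct TensorLite {  // minimal stand-in for lite::TensorLite
  std::size_t memory_size = 0;
  bool persistable = false;
  void CopyDataFrom(const TensorLite& other) {
    memory_size = other.memory_size;
    persistable = other.persistable;  // the added line: keep the flag
    // (the real code also copies dims/lod/precision and the buffer contents)
  }
};

int main() {
  TensorLite weight;
  weight.memory_size = 1024;
  weight.persistable = true;  // e.g. a weight tensor kept across runs
  TensorLite copy;
  copy.CopyDataFrom(weight);
  assert(copy.persistable);  // would fail without the persistable_ copy
}
```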
......@@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......@@ -44,6 +45,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co
X86_DEPS ${x86_kernels}
ARM_DEPS ${arm_kernels}
NPU_DEPS ${npu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
......
......@@ -14,3 +14,4 @@ add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -37,7 +37,7 @@ bool SubgraphEngine::BuildDeviceProgram() {
subgraph::apu::Graph graph;
int neuron_errCode = NeuronModel_create(&model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create model";
LOG(WARNING) << "[APU] Failed to create the neuron model!";
return false;
}
graph.set_model(model_);
......@@ -46,11 +46,12 @@ bool SubgraphEngine::BuildDeviceProgram() {
// Convert all of the ops with their input vars and weights, and add them
// into the APU NIR graph
if (origin_program_.empty()) {
if (!origin_program_) {
BuildOriginProgram();
}
const auto& bridges = subgraph::Registry::Instance();
for (auto& inst : origin_program_) {
const auto& insts = origin_program_->instructions(kRootBlockIdx);
for (auto& inst : insts) {
auto op = const_cast<OpLite*>(inst.op());
CHECK(op);
op->CheckShape();
......@@ -70,55 +71,38 @@ bool SubgraphEngine::BuildDeviceProgram() {
}
}
// Get input tensor
std::vector<uint32_t> ins;
origin_itensors_.resize(input_names_.size());
origin_idims_.resize(input_names_.size());
// Get the index of input tensors
std::vector<uint32_t> input_indices;
for (int i = 0; i < input_names_.size(); i++) {
origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]);
CHECK(origin_itensors_[i]);
origin_idims_[i] = origin_itensors_[i]->dims();
VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":"
<< origin_idims_[i].production();
// Get input index
int idx;
if (graph.Has(input_names_[i])) {
ins.push_back(graph.Get(input_names_[i])->index());
VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find input: " << input_names_[i];
return false;
}
CHECK(graph.Has(input_names_[i])) << "[APU] Failed to find input node "
<< input_names_[i];
auto index = graph.Get(input_names_[i])->index();
input_indices.push_back(index);
VLOG(3) << "[APU] Input[" << i << "] name " << input_names_[i] << " dims "
<< origin_itensors_[i]->dims() << " index " << index;
}
// Get output tensor
std::vector<uint32_t> outs;
origin_otensors_.resize(output_names_.size());
origin_odims_.resize(output_names_.size());
// Get the index of output tensors
std::vector<uint32_t> output_indices;
for (int i = 0; i < output_names_.size(); i++) {
origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]);
CHECK(origin_otensors_[i]);
origin_odims_[i] = origin_otensors_[i]->dims();
VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":"
<< origin_odims_[i].production();
CHECK(graph.Has(output_names_[i])) << "[APU] Failed to find output node "
<< output_names_[i];
origin_otensors_[i]->mutable_data<int8_t>();
// Get output index
if (graph.Has(output_names_[i])) {
outs.push_back(graph.Get(output_names_[i])->index());
VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index();
} else {
LOG(WARNING) << "Fail to find output: " << output_names_[i];
return false;
}
auto index = graph.Get(output_names_[i])->index();
output_indices.push_back(index);
VLOG(3) << "[APU] Output[" << i << "] name " << output_names_[i] << " dims "
<< origin_otensors_[i]->dims() << " index " << index;
}
VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size();
// Set subgraph input/output
NeuronModel_identifyInputsAndOutputs(
model_, ins.size(), &ins[0], outs.size(), &outs[0]);
// Identify the input and output tensors of the neuron model
NeuronModel_identifyInputsAndOutputs(model_,
input_indices.size(),
&input_indices[0],
output_indices.size(),
&output_indices[0]);
neuron_errCode = NeuronModel_finish(model_);
if (NEURON_NO_ERROR != neuron_errCode) {
LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode;
LOG(WARNING) << "[APU] Fail to create NIR model:" << neuron_errCode;
return false;
}
VLOG(3) << "[APU] APU NIR model created!";
......@@ -207,11 +191,11 @@ SubgraphEngine::~SubgraphEngine() {
void SubgraphCompute::PrepareForRun() {
auto& param = this->Param<param_t>();
engine_.reset(new SubgraphEngine(ctx_.get(),
param.sub_block_idx,
param.sub_block_desc,
param.block_idx,
param.program_desc,
param.exec_scope,
param.input_data_names,
param.output_data_names,
param.scope));
param.output_data_names));
CHECK(engine_);
}
......
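The rewritten loops above replace the silent `return false` with a hard `CHECK` and gather tensor indices for `NeuronModel_identifyInputsAndOutputs`. A generic sketch of that gather step, using a plain map as a toy stand-in for `subgraph::apu::Graph` and an `assert` in place of `CHECK`; the Neuron API itself is not modeled here:

```cpp
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Toy graph: var name -> node index assigned during bridge conversion.
  std::map<std::string, uint32_t> graph = {{"x", 0}, {"w", 1}, {"y", 2}};
  std::vector<std::string> input_names = {"x", "w"};
  std::vector<uint32_t> input_indices;
  for (std::size_t i = 0; i < input_names.size(); ++i) {
    // Mirrors CHECK(graph.Has(...)): a missing node is a bug in graph
    // construction, so fail loudly instead of returning false.
    assert(graph.count(input_names[i]) && "Failed to find input node");
    uint32_t index = graph.at(input_names[i]);
    input_indices.push_back(index);
    std::cout << "Input[" << i << "] " << input_names[i] << " index "
              << index << "\n";
  }
  // input_indices would then be handed to the device API in one call.
}
```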
......@@ -31,12 +31,16 @@ class SubgraphEngine : public subgraph::Engine {
public:
SubgraphEngine(KernelContext *ctx,
int block_idx,
cpp::BlockDesc *block_desc,
const std::shared_ptr<const cpp::ProgramDesc> &program_desc,
Scope *exec_scope,
const std::vector<std::string> &input_names,
const std::vector<std::string> &output_names,
Scope *scope)
: subgraph::Engine(
ctx, block_idx, block_desc, input_names, output_names, scope) {}
const std::vector<std::string> &output_names)
: subgraph::Engine(ctx,
block_idx,
program_desc,
exec_scope,
input_names,
output_names) {}
~SubgraphEngine();
......
......@@ -75,7 +75,6 @@ add_kernel(generate_proposals_compute_arm ARM extra SRCS generate_proposals_comp
add_kernel(roi_align_compute_arm ARM extra SRCS roi_align_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(box_clip_compute_arm ARM extra SRCS box_clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(assign_value_compute_arm ARM basic SRCS assign_value_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(conditional_block_compute_arm ARM extra SRCS conditional_block_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(collect_fpn_proposals_compute_arm ARM extra SRCS collect_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(distribute_fpn_proposals_compute_arm ARM extra SRCS distribute_fpn_proposals_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(clip_compute_arm ARM extra SRCS clip_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -87,7 +86,6 @@ add_kernel(beam_search_decode_compute_arm ARM extra SRCS beam_search_decode_comp
add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
......@@ -202,17 +202,13 @@ void ElementwiseMulCompute<T, PType>::Run() {
}
}
template <>
void ElementwiseMulCompute<int64_t, PRECISION(kInt64)>::Run() {
auto& param = this->template Param<operators::ElementwiseParam>();
lite::arm::math::elementwise_compute_basic<int64_t>(param, "mul", "");
}
void ElementwiseMulActivationCompute::Run() {
auto& param = Param<operators::FusionElementwiseActivationParam>();
const float* x_data = param.X->data<float>();
const float* y_data = param.Y->data<float>();
float* out_data = param.Out->mutable_data<float>();
template <typename T, PrecisionType PType>
void ElementwiseMulActivationCompute<T, PType>::Run() {
auto& param =
this->template Param<operators::FusionElementwiseActivationParam>();
auto* x_data = param.X->template data<T>();
auto* y_data = param.Y->template data<T>();
auto* out_data = param.Out->template mutable_data<T>();
int axis = param.axis;
std::string act_type = param.act_type;
auto x_dims = param.X->dims();
......@@ -221,21 +217,21 @@ void ElementwiseMulActivationCompute::Run() {
if (x_dims.size() < y_dims.size() &&
is_broadcast(y_dims, x_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast<float>(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
y_data, x_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else if (is_broadcast(x_dims, y_dims, axis, &pre, &n, &post)) {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu_broadcast(
lite::arm::math::elementwise_mul_relu_broadcast<T>(
x_data, y_data, out_data, pre, n, post);
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
}
} else {
if (act_type == "relu") {
lite::arm::math::elementwise_mul_relu(
lite::arm::math::elementwise_mul_relu<T>(
x_data, y_data, out_data, x_dims.production());
} else {
LOG(FATAL) << "unsupported Activation type: " << act_type;
......@@ -426,46 +422,60 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_float =
using elementwise_mul_float_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float, def)
elementwise_mul, kARM, kFloat, kNCHW, elementwise_mul_float_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mul_int32 =
using elementwise_mul_int32_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int, PRECISION(kInt32)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32, def)
elementwise_mul, kARM, kInt32, kNCHW, elementwise_mul_int32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.Finalize();
using elementwise_mul_int64 =
using elementwise_mul_int64_t =
paddle::lite::kernels::arm::ElementwiseMulCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64, def)
elementwise_mul, kARM, kInt64, kNCHW, elementwise_mul_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(
fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::ElementwiseMulActivationCompute,
def)
using fusion_elementwise_mul_activation_float_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kFloat,
kNCHW,
fusion_elementwise_mul_activation_float_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using fusion_elementwise_mul_activation_int64_t = paddle::lite::kernels::arm::
ElementwiseMulActivationCompute<int64_t, PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(fusion_elementwise_mul_activation,
kARM,
kInt64,
kNCHW,
fusion_elementwise_mul_activation_int64_t,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.Finalize();
REGISTER_LITE_KERNEL(elementwise_max,
kARM,
kFloat,
......@@ -489,22 +499,22 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_fp32 =
using elementwise_div_fp32_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32, def)
elementwise_div, kARM, kFloat, kNCHW, elementwise_div_fp32_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_div_int64 =
using elementwise_div_int64_t =
paddle::lite::kernels::arm::ElementwiseDivCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64, def)
elementwise_div, kARM, kInt64, kNCHW, elementwise_div_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
......@@ -522,11 +532,11 @@ REGISTER_LITE_KERNEL(
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
using elementwise_mod_int64 =
using elementwise_mod_int64_t =
paddle::lite::kernels::arm::ElementwiseModCompute<int64_t,
PRECISION(kInt64)>;
REGISTER_LITE_KERNEL(
elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64, def)
elementwise_mod, kARM, kInt64, kNCHW, elementwise_mod_int64_t, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
......
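The elementwise kernels above moved from a fixed-precision class to a class template over `(T, PType)`, with one `using` alias per instantiation fed to the registration macro. A compilable miniature of that pattern; the toy `Run` body and the alias names here are illustrative, and the real registration goes through `REGISTER_LITE_KERNEL` rather than plain aliases:

```cpp
#include <cstdint>
#include <iostream>

enum class Precision { kFloat, kInt64 };  // stand-in for PrecisionType

template <typename T, Precision PType>
class ElementwiseMulActivationCompute {  // sketch of the templated kernel
 public:
  void Run(const T* x, const T* y, T* out, int n) {
    for (int i = 0; i < n; ++i) {
      T v = x[i] * y[i];
      out[i] = v > T(0) ? v : T(0);  // fused "relu" activation
    }
  }
};

// One alias per instantiation, mirroring
// using fusion_elementwise_mul_activation_float_t = ...<float, kFloat>;
using fusion_mul_act_float_t =
    ElementwiseMulActivationCompute<float, Precision::kFloat>;
using fusion_mul_act_int64_t =
    ElementwiseMulActivationCompute<int64_t, Precision::kInt64>;

int main() {
  float x[] = {1.f, -2.f}, y[] = {3.f, 4.f}, out[2];
  fusion_mul_act_float_t kernel;
  kernel.Run(x, y, out, 2);
  std::cout << out[0] << " " << out[1] << "\n";  // 3 0
}
```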
......@@ -62,8 +62,8 @@ class ElementwiseMulCompute : public KernelLite<TARGET(kARM), PType> {
virtual ~ElementwiseMulCompute() = default;
};
class ElementwiseMulActivationCompute
: public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class ElementwiseMulActivationCompute : public KernelLite<TARGET(kARM), PType> {
public:
void Run() override;
......
......@@ -533,13 +533,15 @@ TEST(fusion_elementwise_mul_activation_arm, retrive_op) {
}
TEST(fusion_elementwise_mul_activation_arm, init) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
ASSERT_EQ(fusion_elementwise_mul_activation.precision(), PRECISION(kFloat));
ASSERT_EQ(fusion_elementwise_mul_activation.target(), TARGET(kARM));
}
TEST(fusion_elementwise_mul_activation_arm, compute) {
ElementwiseMulActivationCompute fusion_elementwise_mul_activation;
ElementwiseMulActivationCompute<float, PRECISION(kFloat)>
fusion_elementwise_mul_activation;
operators::FusionElementwiseActivationParam param;
lite::Tensor x, y, output, output_ref;
......
(55 file diffs collapsed.)