Unverified commit fdf44c59, authored by Leonardo-Ding, committed by GitHub

Merge branch 'develop' into dwh_dev

......@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profiling when profile mode is ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing exceptions when an error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -224,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
......@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to a libomp bug (fixed for ARM64 by https://reviews.llvm.org/D19879, but still present on ARM32), disable OpenMP on armv7 when cross-compiling with Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. Path to the Huawei Ascend install directory
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. Find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.2 dependency of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK version 320 or later
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -94,12 +94,10 @@ function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT ${GEN_HEADER}
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}"
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
--force-empty --force-empty-vectors
${OPT}
-I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test"
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
......@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
......@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -537,6 +557,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# Set the build options and start compiling
# android-armv7:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv7 \
--toolchain=clang \
--with_cv=OFF \
--with_log=OFF \
--with_extra=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# android-armv8:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv8 \
--toolchain=clang \
--with_log=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# Note: for build help, run: ./lite/tools/build_android.sh help
```
......@@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \
## 3. How to use it in code

That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository;
That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository, which also shows how to check whether the current device supports OpenCL;
Note: the link above points to the latest code on the online develop branch, which may well differ from your local code; it is recommended to look at the code in your local `lite/demo/cxx/` directory to see how to use it.
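A minimal sketch of that OpenCL check, written against the public `paddle_api.h` shipped with the `mobile_light` demo; the model path and the CPU fallback message here are illustrative assumptions, not part of the demo itself:

```cpp
#include <iostream>
#include "paddle_api.h"  // public Paddle-Lite inference API

int main(int argc, char** argv) {
  using namespace paddle::lite_api;  // NOLINT
  // IsOpenCLBackendValid() reports whether the current device can actually
  // run the OpenCL backend; fall back to CPU when it cannot.
  if (!IsOpenCLBackendValid()) {
    std::cout << "OpenCL is not available on this device, running on CPU"
              << std::endl;
  }
  MobileConfig config;
  // Hypothetical model path; pass your own optimized .nb model instead.
  config.set_model_from_file(argc > 1 ? argv[1] : "./mobilenet_v1_opt.nb");
  auto predictor = CreatePaddlePredictor<MobileConfig>(config);
  return 0;
}
```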
......
......@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -45,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
......@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if(WIN32)
target_link_libraries(paddle_light_api_shared shlwapi.lib)
endif()
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
......@@ -94,6 +95,7 @@ if (WITH_TESTING)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
APU_DEPS ${apu_kernels})
endif()
......@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
......@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
......@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -191,6 +200,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -415,7 +427,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -430,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -445,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -459,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -470,8 +486,9 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -487,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
......@@ -58,17 +57,21 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source_file name from op type and kernel type
......@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;
core::KernelPickFactor factor;
......
......@@ -49,18 +49,33 @@ class LITE_API Predictor {
program_desc_ = std::make_shared<cpp::ProgramDesc>();
}
// Create a predictor with the weight variable scope set.
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. Create a predictor with the
// given weight variable scope.
///////////////////////////////////////////////////////////////////
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. This constructor can only be
// called from Predictor::Clone. It creates a predictor from an
// existing ProgramDesc, Scope and RuntimeProgram.
///////////////////////////////////////////////////////////////////
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
: program_desc_(program_desc), scope_(root) {
// step1. Create a Program to construct the exec_scope and ops
Program program(program_desc_, scope_, valid_places, var_names);
exec_scope_ = program.exec_scope();
valid_places_ = valid_places;
// step 2. Create the RuntimeProgram.
program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx));
program_generated_ = true;
}
// Build from a model, with places set for hardware config.
......@@ -79,32 +94,62 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const {
//////////////////////////////////////////////////////////
// Function: Clone
// Usage: Create a Predictor from an existed one,
// the cloned predictor will share persistable variables
// in scope_ with the original predictor.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone() {
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
// step3. Return the result
return predictor;
}
std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
//////////////////////////////////////////////////////////
// Function: Clone(var_names)
// Usage: Create a Predictor from an existing one; the cloned
// predictor shares persistable variables with the original,
// except for the variables whose names are listed in
// var_names, which are not shared.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone(const std::vector<std::string>& var_names) {
CHECK(program_desc_) << "Both program and scope of current predicotr "
"should be not be nullptr in Clone mode.";
CHECK(scope_) << "Both program and scope of current predicotr should be "
"not be nullptr in Clone mode.";
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
// step 3. Copy the listed persistable variables into the clone's private scope.
for (auto var_name : var_names) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor =
predictor->scope_->Var(var_name)->GetMutable<lite::Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
// step4. Return the result
return predictor;
}
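A hedged usage sketch of the two `Clone` overloads above; the helper names and the variable name `"fc_0.w_0"` are illustrative assumptions, and the base predictor is expected to have been built elsewhere:

```cpp
#include <memory>
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"  // declares paddle::lite::Predictor

// Clone(): the returned predictor shares all persistable variables
// (the weights) with `base` through the common scope_.
std::shared_ptr<paddle::lite::Predictor> MakeSharedClone(
    const std::shared_ptr<paddle::lite::Predictor>& base) {
  return base->Clone();
}

// Clone(var_names): the listed variables get a private copy in the clone's
// exec_scope_, so the clone can overwrite them without affecting `base`.
std::shared_ptr<paddle::lite::Predictor> MakePrivateClone(
    const std::shared_ptr<paddle::lite::Predictor>& base) {
  return base->Clone(std::vector<std::string>{"fc_0.w_0"});  // hypothetical name
}
```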
......@@ -140,6 +185,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......@@ -162,7 +208,7 @@ class LITE_API Predictor {
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_;
std::shared_ptr<RuntimeProgram> program_;
bool program_generated_{false};
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
......
......@@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
......
......@@ -22,17 +22,18 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
program_desc_.reset();
}
void LightPredictor::Build(const std::string& model_dir,
......@@ -43,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -60,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -109,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -132,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
// Only extract the ops and generate the runtime program from the main
// block desc
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -205,10 +189,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
......
......@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.25 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......
......@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
......@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
// Get the length of the first column: maximum length of the op_type
......
......@@ -32,9 +32,22 @@
#include "lite/backends/mlu/target_wrapper.h"
#endif
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_runtime.h"
#endif
namespace paddle {
namespace lite_api {
bool IsOpenCLBackendValid() {
bool opencl_valid = false;
#ifdef LITE_WITH_OPENCL
opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice();
#endif
LOG(INFO) << "opencl_valid:" << opencl_valid;
return opencl_valid;
}
Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
// TODO(Superjomn) refine this by using another `const void* const_raw`;
......
......@@ -33,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// Return true if the current device supports the OpenCL backend
LITE_API bool IsOpenCLBackendValid();
struct LITE_API Tensor {
explicit Tensor(void* raw);
explicit Tensor(const void* raw);
......@@ -123,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
......@@ -142,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
// Set/get the device ID (e.g. which Huawei Ascend NPU card to use)
void set_device_id(int device_id) { device_id_ = device_id; }
int get_device_id() const { return device_id_; }
};
/// CxxConfig is the config for the Full feature predictor.
......
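A hedged sketch of how the new device-ID setting and the subgraph model cache directory might be used together with the Huawei Ascend NPU target; `set_subgraph_model_cache_dir`, the model directory, and the place list are assumptions based on the surrounding code, not something this diff confirms:

```cpp
#include "paddle_api.h"
#include "paddle_place.h"

int main() {
  using namespace paddle::lite_api;  // NOLINT
  CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places(
      {Place{TargetType::kHuaweiAscendNPU, PrecisionType::kFloat},
       Place{TargetType::kX86, PrecisionType::kFloat}});
  // Added in this change: choose which Ascend device the subgraphs run on.
  config.set_device_id(0);
  // Cache the generated subgraph models to avoid rebuilding them every run.
  config.set_subgraph_model_cache_dir("/tmp/ascend_cache");  // assumed setter
  auto predictor = CreatePaddlePredictor<CxxConfig>(config);
  return 0;
}
```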
......@@ -75,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
"bm",
"mlu",
"rknpu",
"apu"};
"apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -118,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM",
"kMLU",
"kRKNPU",
"kAPU"};
"kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -163,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11,
kRKNPU = 12,
kAPU = 13,
NUM = 14, // number of fields.
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -28,6 +28,7 @@ USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_conv_conv_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
......@@ -47,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
......
......@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din,
}
return true;
}
// for example: m = 4, n = 4
// din  = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11], [12, 13, 14, 15]]
// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
/*
   m = 8, n = 8:
   din (8x8, row-major):          dout (8x8, transposed):
    0  1  2  3  4  5  6  7         0  8 16 24 32 40 48 56
    8  9 10 11 12 13 14 15         1  9 17 25 33 41 49 57
   16 17 18 19 20 21 22 23         2 10 18 26 34 42 50 58
   24 25 26 27 28 29 30 31         3 11 19 27 35 43 51 59
   32 33 34 35 36 37 38 39         4 12 20 28 36 44 52 60
   ...                             ...
*/
template <typename Dtype>
void local_transpose(const Dtype* din, Dtype* dout, int m, int n) {
// n % 4 == 0 && m % 4 == 0
// n * m ==> n * m data trans
int offset_m = m << 2;
const Dtype* din_ptr = din;
Dtype* dout_ptr = dout;
for (int i = 0; i < n; i += 4) {
Dtype* out_ptr0 = dout_ptr;
Dtype* out_ptr1 = dout_ptr + m;
Dtype* out_ptr2 = out_ptr1 + m;
Dtype* out_ptr3 = out_ptr2 + m;
const Dtype* in_ptr0 = din_ptr;
const Dtype* in_ptr1 = din_ptr + m;
const Dtype* in_ptr2 = in_ptr1 + m;
const Dtype* in_ptr3 = in_ptr2 + m;
for (int j = 0; j < m; j += 4) {
float32x4_t vin0 = vld1q_f32(in_ptr0);
float32x4_t vin1 = vld1q_f32(in_ptr1);
float32x4_t vin2 = vld1q_f32(in_ptr2);
float32x4_t vin3 = vld1q_f32(in_ptr3);
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3);
in_ptr0 = in_ptr3 + m;
in_ptr1 = in_ptr3 + 2 * m;
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
in_ptr2 = in_ptr1 + m;
in_ptr3 = in_ptr1 + 2 * m;
vst1q_f32(out_ptr0, tmp0.val[0]);
vst1q_f32(out_ptr1, tmp0.val[1]);
out_ptr0 += 4;
out_ptr1 += 4;
vst1q_f32(out_ptr2, tmp2.val[0]);
vst1q_f32(out_ptr3, tmp2.val[1]);
out_ptr2 += 4;
out_ptr3 += 4;
}
dout_ptr += offset_m;
din_ptr += 4;
}
}
template <typename Dtype>
void transpose(const Dtype* din, Dtype* dout, int m, int n) {
// transpose an n x m (row-major) matrix into an m x n matrix
// processed in 4 x 4 tiles, with scalar loops for the remainders
int cnt_n = n >> 2;
int remain_n = n & 3;
int cnt_m = m >> 2;
int remain_m = m & 3;
int nn_num = n << 2; // n * 4
int mm_num = m << 2; // m * 4
for (int x = 0; x < cnt_n; x++) {
const Dtype* din_ptr0 = din + x * mm_num;
const Dtype* din_ptr1 = din_ptr0 + m;
const Dtype* din_ptr2 = din_ptr1 + m;
const Dtype* din_ptr3 = din_ptr2 + m;
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(din0, din1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(din2, din3);
din_ptr0 += 4;
din_ptr1 += 4;
// a00 b00 c00 d00 a02 b02 c02 d02
// a01 b01 c01 d01 a03 b03 c03 d03
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
din_ptr2 += 4;
din_ptr3 += 4;
vst1q_f32(dout_ptr0, tmp0.val[0]);
vst1q_f32(dout_ptr1, tmp0.val[1]);
dout_ptr0 += nn_num;
vst1q_f32(dout_ptr2, tmp2.val[0]);
vst1q_f32(dout_ptr3, tmp2.val[1]);
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
*dout_ptr0++ = *din_ptr1++;
*dout_ptr0++ = *din_ptr2++;
*dout_ptr0++ = *din_ptr3++;
}
}
const Dtype* din_ptr0 = din + cnt_n * mm_num;
dout = dout + cnt_n * 4;
for (int x = 0; x < remain_n; x++) {
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
din_ptr0 += 4;
*dout_ptr0 = din0[0];
*dout_ptr1 = din0[1];
dout_ptr0 += nn_num;
*dout_ptr2 = din0[2];
*dout_ptr3 = din0[3];
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
}
}
}
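/* Reference (scalar) semantics of transpose() above, added as an explanatory
 * sketch (not used by the library): with din an n x m row-major matrix and
 * dout an m x n row-major matrix,
 *   for (int i = 0; i < n; ++i)
 *     for (int j = 0; j < m; ++j)
 *       dout[j * n + i] = din[i * m + j];
 * The NEON version produces the same result using 4 x 4 tiles plus the scalar
 * tail loops.
 */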
/*preprocessing inputs
* input din: [1, chin, he-hs, we - ws] --> outputs dout: [n, chin, 1, we - ws]
* n = he - hs
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
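// Broadcast layout note (added for clarity): x is treated as
// [batch, channels, num] and y as [channels]; each y[j] is multiplied into the
// whole [num]-long slice of x at (i, j).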
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din,
} else {
if (pad_bottom > 1) {
coef_h = 1.f / 3;
} else if (pad_bottom = 1) {
} else if (pad_bottom == 1) {
coef_h = 0.5f;
} else {
coef_h = 1.f;
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace arm {
namespace math {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
......@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din,
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
height = height - 1;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; ++w) {
dout_ptr[w] += din_ptr[w];
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
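// Vectorized accumulation (explanatory note): the width is processed in
// 4-float NEON columns (cnt_w groups plus remain_w scalar tails) and the
// height in blocks of 4 rows (cnt_h), with "stride" advancing 4 * width
// floats per block and remain_h rows handled one at a time.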
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
}
din_ptr += width;
for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din,
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
int remain_h = height - 1;
for (int h = 0; h < remain_h; h++) {
for (int w = 0; w < width; w++) {
dout_ptr[w] = std::max(dout_ptr[w], din_ptr[w]);
height = height - 1;
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
}
din_ptr += width;
for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......
......@@ -11,10 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,10 +28,13 @@ set (
cuda_transpose
cuda_elementwise
cudnn_pool
cuda_gru_forward
cuda_sequence2batch
cuda_gemm
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
cuda_bias
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/backends/cuda/math/utils.h"
......@@ -21,6 +22,20 @@ namespace lite {
namespace cuda {
namespace math {
ActivationType GetActiveType(const std::string& act) {
if (act == "sigmoid") {
return kSigmoid;
} else if (act == "relu") {
return kReLU;
} else if (act == "tanh") {
return kTanh;
} else if (act == "identify") {
return kIdentity;
} else {
LOG(FATAL) << "not supported activation: " << act;
}
}
template <typename T>
__global__ void relu_kernel(const int num,
const float alpha,
......@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t);
template void bias_relu(
int, const float*, const float* bias, float*, float, cudaStream_t);
// ------------- sigmoid -------------
template <typename T>
__global__ void sigmoid_kernel(const int num, const T* in, T* out) {
CUDA_KERNEL_LOOP(i, num) {
#if __CUDA_ARCH__ >= 350
out[i] = static_cast<T>(1.0f) /
(static_cast<T>(1.0f) + expf(-1 * __ldg(in + i)));
#else
out[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + expf(-in[i]));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half* in, half* out) {
CUDA_KERNEL_LOOP(i, num) {
half tmp = __float2half(1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __hdiv(
tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i)))));
#else
out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i]))));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) {
CUDA_KERNEL_LOOP(i, num) {
half2 tmp = __floats2half2_rn(1.0f, 1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __h2div(tmp,
__hadd2(tmp,
h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f),
__ldg(in + i)))));
#else
out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x))));
out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y))));
#endif
}
}
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) {
sigmoid_kernel<T><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
CUDA_POST_KERNEL_CHECK;
}
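// The fp16 specialization below packs pairs of elements into half2 when "num"
// is even, so each thread processes two values (explanatory note).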
template <>
void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) {
if (num % 2 == 0) {
const half2* din2 = reinterpret_cast<const half2*>(din);
half2* dout2 = reinterpret_cast<half2*>(dout);
sigmoid_kernel<
half2><<<CUDA_GET_BLOCKS(num / 2), CUDA_NUM_THREADS, 0, stream>>>(
num / 2, din2, dout2);
} else {
sigmoid_kernel<half><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
}
CUDA_POST_KERNEL_CHECK;
}
template void sigmoid(const int num,
const float* din,
float* dout,
cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
......@@ -17,11 +17,22 @@
#include <cuda_runtime.h>
#include <string>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
ActivationType GetActiveType(const std::string& act);
// fp32 and half
template <typename T>
void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream);
......@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num,
const void* scale,
cudaStream_t stream);
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/bias.h"
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void RowwiseAddKernel(
const T* a, const T* b, T* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
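// Note (added for clarity): "b" holds one value per column (width elements)
// and is broadcast-added to every row of "a"; "num" is the total element
// count, i.e. rows * width.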
template <>
__global__ void RowwiseAddKernel(
const half* a, const half* b, half* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = __hadd(a[i], b[w]);
}
}
template <typename T>
void RowwiseAdd<T>::operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream) {
RowwiseAddKernel<T><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
input, bias, output, width, count);
CUDA_POST_KERNEL_CHECK;
}
template struct RowwiseAdd<float>;
template struct RowwiseAdd<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
struct RowwiseAdd {
void operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream);
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/math/gru_forward.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
T prev_out = 0;
T reset_out_val;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == lite::cuda::math::ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
reset_out_val = prev_out * reset_gate_value;
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
template <>
__global__ void GruForwardResetOutput(
half* gate_value,
half* reset_output_value,
half* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
half prev_out = 0;
half reset_out_val;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
#if __CUDA_ARCH__ >= 530
reset_out_val = __hmul(prev_out, reset_gate_value);
#else
reset_out_val =
__float2half(__half2float(prev_out) * __half2float(reset_gate_value));
#endif
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
T output;
T prev_out = 0;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
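// Combine the gates (explanatory note): with update gate u = update_gate_value
// and candidate c = state_frame_value,
//   origin_mode: h_t = u * h_{t-1} + (1 - u) * c
//   otherwise:   h_t = (1 - u) * h_{t-1} + u * c
// which matches the two branches below.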
if (origin_mode) {
output = update_gate_value * prev_out + state_frame_value -
update_gate_value * state_frame_value;
} else {
output = prev_out - update_gate_value * prev_out +
update_gate_value * state_frame_value;
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template <>
__global__ void GruForwardFinalOutput(
half* gate_value,
half* prev_output_value,
half* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
half output;
half prev_out = 0;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
#if __CUDA_ARCH__ >= 530
output =
__hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(state_frame_value) -
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
} else {
#if __CUDA_ARCH__ >= 530
output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(prev_out) -
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template __global__ void GruForwardFinalOutput<float>(
float* gate_value,
float* prev_output_value,
float* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
template __global__ void GruForwardResetOutput<float>(
float* gate_value,
float* reset_output_value,
float* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename Dtype>
inline __device__ Dtype Sigmoid(const Dtype a) {
const Dtype min = SIGMOID_THRESHOLD_MIN;
const Dtype max = SIGMOID_THRESHOLD_MAX;
Dtype tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-tmp));
}
template <>
inline __device__ half Sigmoid(const half a) {
#if __CUDA_ARCH__ >= 530
const half tmp = __float2half(1.0f);
return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a))));
#else
return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f));
#endif
}
template <typename Dtype>
inline __device__ Dtype ReLU(const Dtype a) {
return a > static_cast<Dtype>(0.f) ? a : static_cast<Dtype>(0.f);
}
template <>
inline __device__ half ReLU(const half a) {
const half tmp = __float2half(0.f);
#if __CUDA_ARCH__ >= 530
return __hgt(a, tmp) ? a : tmp;
#else
return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f);
#endif
}
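// Tanh below is computed via the identity tanh(a) = 2 / (1 + exp(-2a)) - 1,
// with the exponent argument clipped at EXP_MAX_INPUT to avoid overflow in
// expf (explanatory note added for clarity).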
template <typename Dtype>
inline __device__ Dtype Tanh(const Dtype a) {
Dtype tmp = static_cast<Dtype>(-2.0) * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
static_cast<Dtype>(1.0);
}
template <>
inline __device__ half Tanh(const half a) {
#if __CUDA_ARCH__ >= 530
half tmp = __float2half(1.0f);
half numerator = __hmul(__float2half(-2.0f), a);
return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp);
#else
float tmp = -2.0f * __half2float(a);
return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f);
#endif
}
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
/*
* threads(tile_size, 1)
* grids(frame_blocks, 1)
*/
template <class T, int TiledSize>
__global__ void FastCollectiveGruGate(T* gate_value,
T* prev_output_value,
T* gate_weight,
T* reset_output,
int frame_size,
ActivationType active_node) {
T xt_0 = 0.0f;
T a0 = 0.0f;
T c0 = 0.0f;
T b0[TiledSize];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int tiled_mask = ((1 << TiledSize) - 1);
// Tiled matrix multiply using register shuffles within a warp, which is faster than going through shared memory.
if (prev_output_value) {
for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = prev_output_value[threadIdx.x + (k * TiledSize)];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size * 2) {
xt_0 = gate_value[col];
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col] = c0;
if (frame_size <= col && col < frame_size * 2) {
T htp_0 = 0.0;
if (prev_output_value) {
htp_0 = prev_output_value[col - frame_size];
}
reset_output[col - frame_size] = c0 * htp_0;
} else if (col < frame_size) {
gate_value[col] = c0;
}
}
}
template <class T, int TiledSize>
__global__ void FastCollectiveGruOut(T* gate_weight,
T* prev_out_value,
T* output_value,
T* gate_value,
T* reset_value,
int frame_size,
ActivationType active_node,
bool origin_mode) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
T a0 = 0.0f;
T b0[TiledSize];
T c0 = 0.0f;
int tiled_mask = ((1 << TiledSize) - 1);
if (prev_out_value) {
for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = reset_value[threadIdx.x + k * TiledSize];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size) {
T xt_0 = gate_value[col + 2 * frame_size];
T gta_0 = gate_value[col];
T htp_0 = 0;
if (prev_out_value) {
htp_0 = prev_out_value[col];
}
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col + 2 * frame_size] = c0;
if (origin_mode) {
output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0;
} else {
output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0;
}
}
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -22,10 +22,6 @@ namespace lite {
namespace cuda {
namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void scale_kernel(int count,
const T* in_data,
......@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T>
__global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
}
......@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
}
template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) {
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void scale(int num, const float*, float*, float, cudaStream_t, float);
template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float);
} // namespace math
......
......@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream);
template <typename T>
void scale(
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence2batch.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void CopyMatrixRowsKernel(const T* src,
T* dst,
const uint64_t* index,
int height,
int width,
bool is_src_index) {
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
}
}
template <typename T>
void CopyMatrixRowsFunctor<T>::operator()(
const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream) {
auto src_dims = src.dims();
auto dst_dims = dst->dims();
CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
int height = dst_dims[0];
int width = dst_dims[1];
const auto* src_data = src.data<T>();
auto* dst_data = dst->template mutable_data<T>(TARGET(kCUDA));
index_tensor_.Resize({static_cast<int64_t>(index_lod.size())});
auto* index_tensor_data = index_tensor_.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(index_tensor_data,
index_lod.data(),
sizeof(uint64_t) * index_lod.size(),
IoDirection::HtoD,
stream);
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
template class CopyMatrixRowsFunctor<float>;
template class CopyMatrixRowsFunctor<half>;
template class LoDTensor2BatchFunctor<float>;
template class LoDTensor2BatchFunctor<half>;
template class Batch2LoDTensorFunctor<float>;
template class Batch2LoDTensorFunctor<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <string>
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true, copy the indexed rows of the input src to the
// output dst. If is_src_index is false, copy the rows of the input src to the
// indexed rows of the output dst. The row indexes are taken from index_lod.
void operator()(const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream);
private:
lite::Tensor index_tensor_;
};
template <typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
struct SeqInfo {
SeqInfo(size_t start_val, size_t len_val, size_t seq_val)
: start(start_val), length(len_val), seq_idx(seq_val) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
void operator()(const lite::Tensor& lod_tensor,
lite::Tensor* batch_tensor,
bool is_reverse,
const cudaStream_t& stream) const {
auto lods = lod_tensor.lod();
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (int seq_id = 0; seq_id < static_cast<int>(lod.size()) - 1; ++seq_id) {
size_t length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
return a.length > b.length;
});
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = 0
// batch_start_positions[1] = len(b0)
// batch_start_positions[2] = len(b0) + len(b1)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// seq_order = {1, 0, 2}, the sort order.
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
LoD batch_lods;
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods[2].resize(seq_info.size());
auto* batch_starts = batch_lods[0].data();
auto* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < max_seqlen; ++n) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
++batch_id;
} else {
break;
}
}
batch_starts[n + 1] = batch_id;
}
auto* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
batch_tensor->set_lod(batch_lods);
lite::cuda::math::CopyMatrixRowsFunctor<T> to_batch;
to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream);
CUDA_POST_KERNEL_CHECK;
}
};
template <typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const lite::Tensor& batch_tensor,
lite::Tensor* lod_tensor,
const cudaStream_t& stream) {
auto in_lod = batch_tensor.lod();
CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at "
"least 2-level sequence infomation.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
lite::cuda::math::CopyMatrixRowsFunctor<T> to_seq;
to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream);
CUDA_POST_KERNEL_CHECK;
}
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -86,8 +86,7 @@ void SequencePadding(T* pad_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void SequencePadding(float* pad_data,
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
......@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> {
static size_t num_devices();
static size_t maximum_stream() { return 0; }
static int GetComputeCapability() {
int dev_id = GetCurDevice();
int major, minor;
CUDA_CALL(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, dev_id));
CUDA_CALL(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, dev_id));
return major * 10 + minor;
}
static size_t GetCurDevice() {
int dev_id;
cudaGetDevice(&dev_id);
......
......@@ -19,7 +19,7 @@
namespace paddle {
namespace lite {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......@@ -30,7 +30,6 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void**>(r)[-1] = p;
memset(r, 0, size);
return r;
}
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
......
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
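# Comment added for clarity: model_client_huawei_ascend_npu wraps the ACL
# runtime (model loading and execution) in model_client.cc, and
# device_huawei_ascend_npu builds on it in device.cc to provide graph building
# and model loading.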
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file not exists:" << model_path;
return nullptr;
}
// Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
// Convert the ACL IR graph to the om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
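// Typical call sequence (illustrative sketch only; the names below mirror the
// declarations in this header):
//   auto& device = Device::Global();
//   std::vector<char> om_buffer;
//   device.Build(input_nodes, output_nodes, &om_buffer);
//   auto model_client = device.LoadFromMem(om_buffer, /* device_id */ 0);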
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph into the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weigth "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] input numher is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
VLOG(3) << "[HUAWEI_ASCEND_NPU] output numher is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = reinterpret_cast<size_t>(output_tensor->size());
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
<< device_output_num << "tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::TensorDesc
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
CHECK(ge_tensor_desc_ != nullptr);
}
~TensorDesc() {
// release the ge::TensorDesc created in the constructor
delete ge_tensor_desc_;
ge_tensor_desc_ = nullptr;
}
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
ge::TensorDesc* ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_{0};
size_t model_weight_size_{0};
void* model_memory_ptr_{nullptr};
void* model_weight_ptr_{nullptr};
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
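// A minimal usage sketch of AclModelClient (the model path, device id and the
// pre-sized tensor vectors below are hypothetical placeholders, not part of
// this header):
//
//   AclModelClient client(/* device_id */ 0);
//   std::vector<std::shared_ptr<ge::Tensor>> inputs, outputs;  // filled by caller
//   if (client.LoadFromFile("/path/to/model.om") &&
//       client.ModelExecute(&inputs, &outputs)) {
//     // read results back from `outputs`
//   }
//   client.UnloadModel();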
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
* This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
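// Usage sketch: wrap each ACL runtime call so that a non-zero status aborts
// with a readable error name; ATC_CALL is used the same way for ge:: graph
// build calls. The device id below is only an example:
//   ACL_CALL(aclrtSetDevice(0));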
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
......@@ -33,7 +33,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() {
}
bool CLRuntime::Init() {
if (initialized_) {
if (is_cl_runtime_initialized_) {
return true;
}
bool is_platform_init = InitializePlatform();
bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init;
initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_;
LOG(INFO) << "is_platform_init:" << is_platform_init;
LOG(INFO) << "is_device_init:" << is_device_init;
if ((is_platform_init == true) && (is_device_init == true)) {
is_platform_device_init_success_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
is_cl_runtime_initialized_ = true;
}
return is_cl_runtime_initialized_;
}
cl::Platform& CLRuntime::platform() {
......@@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() {
}
cl::Device& CLRuntime::device() {
CHECK(device_ != nullptr) << "device_ is not initialized!";
if (device_ == nullptr) {
LOG(ERROR) << "device_ is not initialized!";
}
return *device_;
}
......@@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) {
}
bool CLRuntime::InitializeDevice() {
VLOG(3) << "device_info_.size():" << device_info_.size();
for (auto i : device_info_) {
VLOG(3) << ">>> " << i.first << " " << i.second;
}
if (device_info_.size() > 0 && device_info_.size() <= 2) {
return false;
}
device_info_["PLACEHOLDER"] = 1;
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
// CL_DEVICE_NAME
......@@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() {
status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
CL_CHECK_ERROR(status_);
if (all_devices.empty()) {
LOG(FATAL) << "No OpenCL GPU device found!";
LOG(ERROR) << "No available OpenCL GPU device found!";
return false;
}
device_ = std::make_shared<cl::Device>();
......@@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() {
}
std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
if (0 != device_info_.size()) {
return device_info_;
}
InitializeDevice();
return device_info_;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/backends/opencl/cl_wrapper.h"
typedef enum {
UNKNOWN = 0,
......@@ -68,6 +69,28 @@ class CLRuntime {
public:
static CLRuntime* Global();
bool OpenCLAvaliableForDevice() {
bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
LOG(INFO) << "opencl_lib_found:" << opencl_lib_found;
if (opencl_lib_found == false) return false;
bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
LOG(INFO) << "dlsym_success:" << dlsym_success;
if (dlsym_success == false) return false;
InitializeDevice();
bool support_fp16 =
static_cast<bool>(device_info_["CL_DEVICE_EXTENSIONS_FP16"]);
LOG(INFO) << "support_fp16:" << support_fp16;
if (support_fp16 == false) return false;
is_device_avaliable_for_opencl_ =
dlsym_success && opencl_lib_found && support_fp16;
LOG(INFO) << "is_device_avaliable_for_opencl_:"
<< is_device_avaliable_for_opencl_;
return is_device_avaliable_for_opencl_;
}
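// Usage sketch (hypothetical call site): query device availability before
// initializing the OpenCL runtime, e.g.
//   if (CLRuntime::Global()->OpenCLAvaliableForDevice()) {
//     CLRuntime::Global()->Init();
//   }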
bool Init();
cl::Platform& platform();
......@@ -85,7 +108,7 @@ class CLRuntime {
bool BuildProgram(cl::Program* program, const std::string& options = "");
bool IsInitSuccess() { return is_init_success_; }
bool IsInitSuccess() { return is_platform_device_init_success_; }
std::string cl_path() { return cl_path_; }
......@@ -167,9 +190,11 @@ class CLRuntime {
cl_int status_{CL_SUCCESS};
bool initialized_{false};
bool is_device_avaliable_for_opencl_{false};
bool is_cl_runtime_initialized_{false};
bool is_init_success_{false};
bool is_platform_device_init_success_{false};
};
} // namespace lite
......
......@@ -19,14 +19,16 @@ limitations under the License. */
namespace paddle {
namespace lite {
CLWrapper *CLWrapper::Global() {
static CLWrapper wrapper;
return &wrapper;
}
CLWrapper::CLWrapper() {
CHECK(InitHandle()) << "Fail to initialize the OpenCL library!";
InitFunctions();
opencl_lib_found_ = InitHandle();
CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!";
dlsym_success_ = InitFunctions();
}
bool CLWrapper::InitHandle() {
......@@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() {
}
}
void CLWrapper::InitFunctions() {
bool CLWrapper::InitFunctions() {
CHECK(handle_ != nullptr) << "The library handle can't be null!";
bool dlsym_success = true;
#define PADDLE_DLSYM(cl_func) \
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #cl_func \
LOG(ERROR) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
dlsym_success = false; \
break; \
} \
VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \
......@@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() {
PADDLE_DLSYM(clEnqueueCopyImage);
#undef PADDLE_DLSYM
return dlsym_success;
}
} // namespace lite
......
......@@ -508,13 +508,20 @@ class CLWrapper final {
return clEnqueueCopyImage_;
}
bool OpenclLibFound() { return opencl_lib_found_; }
bool DlsymSuccess() { return dlsym_success_; }
private:
CLWrapper();
CLWrapper(const CLWrapper &) = delete;
CLWrapper &operator=(const CLWrapper &) = delete;
bool InitHandle();
void InitFunctions();
bool InitFunctions();
bool opencl_lib_found_{true};
bool dlsym_success_{true};
void *handle_{nullptr};
clGetPlatformIDsType clGetPlatformIDs_{nullptr};
clGetPlatformInfoType clGetPlatformInfo_{nullptr};
clBuildProgramType clBuildProgram_{nullptr};
......
......@@ -19,7 +19,7 @@
#include <memory>
#include <string>
#include <type_traits>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/target_wrapper.h"
namespace paddle {
namespace lite {
......@@ -82,8 +82,8 @@ void DumpXPUMem(const T* ptr,
size_t item_per_line = 30) {
size_t after_stride_len = (len + stride - 1) / stride;
std::unique_ptr<T[]> cpu_mem(new T[len]);
xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST));
std::unique_ptr<T[]> after_stride(new T[after_stride_len]);
for (size_t i = 0; i < after_stride_len; ++i) {
after_stride[i] = cpu_mem[i * stride];
......
......@@ -19,11 +19,11 @@ namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
XPU_CALL(xpu_malloc(&ptr, size));
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
......@@ -31,10 +31,10 @@ void TargetWrapperXPU::MemcpySync(void* dst,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
......@@ -49,7 +49,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
} else {
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr);
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
}
......
......@@ -16,11 +16,23 @@
#include <memory> // std::unique_ptr
#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free
#include "lite/core/target_wrapper.h"
#include "lite/core/target_wrapper.h" // TargetWrapper
#include "lite/utils/cp_logging.h" // CHECK_EQ
#define XPU_CALL(func) \
{ \
auto e = (func); \
CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
}
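// Usage sketch, mirroring the call sites in this backend (the device id and
// size are only examples):
//   XPU_CALL(xpu_set_device(0));
//   XPU_CALL(xpu_malloc(&ptr, size));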
namespace paddle {
namespace lite {
// MAX(lod.size()) = 64
const int XPU_MAX_LOD_SIZE = 64;
// MAX(lod[i + 1] - lod[i]) = 512
const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
......@@ -33,7 +45,7 @@ struct XPUScratchPad {
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
xpu_free(sp->addr_);
XPU_CALL(xpu_free(sp->addr_));
}
delete sp;
}
......@@ -55,7 +67,7 @@ class TargetWrapper<TARGET(kXPU)> {
size_t size,
IoDirection dir);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false);
static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
......@@ -77,11 +89,10 @@ class TargetWrapper<TARGET(kXPU)> {
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
dev_no = atoi(dev_env);
}
xpu_set_device(dev_no);
XPU_CALL(xpu_set_device(dev_no));
}
static std::string multi_encoder_precision; // NOLINT
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)});
{TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
......@@ -32,25 +32,35 @@ void TestCase::CreateInstruction() {
#endif
if (enable_subgraph_op) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
// keep only data input vars (skip persistable/const vars)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -60,7 +70,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -46,7 +46,7 @@ class TestCase {
base_scope_(new Scope) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareData();
......
......@@ -17,8 +17,13 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -61,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -84,6 +86,35 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
#endif
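// Usage sketch (the scope pointer and cache path are hypothetical): a caller
// stores the subgraph model cache dir in the scope and a subgraph kernel
// reads it back later:
//   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(scope, "/data/cache");
//   auto dir = Context<TargetType::kNPU>::SubgraphModelCacheDir(scope);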
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
......@@ -91,8 +122,14 @@ class Context<TargetType::kNPU> {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static std::string subgraph_model_cache_dir_;
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
......@@ -385,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
......@@ -466,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
......
......@@ -18,6 +18,7 @@ lite_cc_library(mir_passes
fusion/conv_activation_fuse_pass.cc
fusion/var_conv_2d_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc
fusion/conv_conv_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
......@@ -32,6 +33,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the
// nodes which are only linked to the control flow op nodes but never linked
// to the other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to any op in
// the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < static_cast<int>(block_size));
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
// A tensor array may be used only as an output var in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
// remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes
// which are only linked to the control flow op nodes but never linked to the
// other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
// Remove the nodes which are used in the subblock recursively; the remaining
// nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation
lite_cc_library(fuse_conv_bn
SRCS conv_bn_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_conv_conv
SRCS conv_conv_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_elementwise_add_activation
SRCS elementwise_add_activation_fuser.cc
DEPS pattern_matcher_high_api)
......@@ -42,6 +45,7 @@ set(mir_fusers
fuse_conv_activation
fuse_var_conv_activation
fuse_conv_bn
fuse_conv_conv
fuse_quant_dequant
fuse_elementwise_add_activation
fuse_transpose_softmax_transpose
......
......@@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......@@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase {
<< "Y of last fc must have been transposed";
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, scope);
fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places());
block2_stmt->SetOp(fake_subgraph_op);
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
bool has_arm = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
has_arm = true;
break;
}
}
if (!has_arm) {
return;
}
// only support fp32 fusion
for (auto conv_has_bias0 : conv_has_bias_cases) {
for (auto conv_has_bias1 : conv_has_bias_cases) {
for (auto conv_type0 : conv_type_cases) {
for (auto conv_type1 : conv_type_cases) {
VLOG(4) << "conv_has_bias0:" << conv_has_bias0
<< " conv_type0:" << conv_type0;
VLOG(4) << "conv_has_bias1:" << conv_has_bias1
<< " conv_type1:" << conv_type1;
fusion::ConvConvFuser fuser(
conv_type0, conv_type1, conv_has_bias0, conv_has_bias1);
fuser(graph.get());
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
.BindTargets({TARGET(kARM)});
......@@ -14,18 +14,19 @@
#pragma once
#include "lite/backends/xpu/xpu_header_sitter.h"
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace mir {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
class ConvConvFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace xpu
} // namespace kernels
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include <memory>
#include <set>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ConvConvFuser::BuildPattern() {
auto* conv_input0 = VarNode("conv_input0")
->assert_is_op_input(conv_type0_, "Input")
->AsInput();
auto* conv_weight0 = VarNode("conv_weight0")
->assert_is_op_input(conv_type0_, "Filter")
->AsInput();
auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_);
auto* conv_out0 = VarNode("conv_out0")
->assert_is_op_output(conv_type0_, "Output")
->assert_is_op_input(conv_type1_, "Input")
->AsIntermediate();
auto* conv_weight1 = VarNode("conv_weight1")
->assert_is_op_input(conv_type1_, "Filter")
->AsIntermediate();
auto* conv1 = OpNode("conv2d1", conv_type1_)
->assert_is_op(conv_type1_)
->assert_op_attr<int>("groups", 1)
->AsIntermediate();
auto* conv_out1 = VarNode("conv_out1")
->assert_is_op_output(conv_type1_, "Output")
->AsOutput();
if (conv_has_bias0_) {
if (conv_has_bias1_) {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
} else {
conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0});
if (conv_has_bias1_) {
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
}
}
void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto conv_instruct = matched.at("conv2d0")->stmt();
auto conv_op_desc = conv_instruct->mutable_op_info();
auto conv = conv_instruct->op();
auto* scope = conv->scope();
auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info();
// conv0
auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name)
->GetMutable<lite::Tensor>();
// conv1
auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name)
->GetMutable<lite::Tensor>();
// auto groups0 = conv_op_desc->GetAttr<int>("groups");
auto groups1 = conv_op_desc1->GetAttr<int>("groups");
auto strides1 = conv_op_desc1->GetAttr<std::vector<int>>("strides");
auto paddings1 = conv_op_desc1->GetAttr<std::vector<int>>("paddings");
auto dilations1 = conv_op_desc1->GetAttr<std::vector<int>>("dilations");
bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? true : false;
int kw = weight1_t->dims()[2];
int kh = weight1_t->dims()[3];
if (!(kw == 1 && kh == 1)) {
return;
}
CHECK_EQ(enable0_int8, enable1_int8)
<< "The compute types of the two convs must be the same";
CHECK_EQ(groups1, 1) << "The groups of conv1 must be 1";
CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1])
<< "weight0_dims[0] must equal weight1_dims[1]";
for (int i = 0; i < strides1.size(); i++) {
CHECK_EQ(strides1[i], 1) << "strides[" << i << "]: " << strides1[i]
<< " must be 1";
}
for (int i = 0; i < paddings1.size(); i++) {
CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i]
<< " must be 0";
}
for (int i = 0; i < dilations1.size(); i++) {
CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i]
<< " must be 1";
}
// compute new_weight and new_bias
///////////////////////////////////////////////////////////////////////////////
// Compute ConvConvFuser
// Before fusion:
//
// conv0(x) = kx + z = y
// conv1(y) = ay + b
//
// After fusion:
//
// conv1(conv0(x)) = a(kx + z) + b = akx + (az + b)
//
// new_weights = ak
// new_bias = az + b
///////////////////////////////////////////////////////////////////////////////
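// A concrete shape sketch (values are illustrative only): if conv0 has weight
// [oc0, ic, kh, kw] = [64, 3, 3, 3] and conv1 is a 1x1 conv with weight
// [oc, oc0, 1, 1] = [128, 64, 1, 1], the fused weight gets shape
// [oc, ic, kh, kw] = [128, 3, 3, 3] and the fused bias has length oc = 128.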
if (enable0_int8) {
LOG(FATAL) << "it doesn't support";
return;
} else {
// compute new conv_weight
Tensor weight_tensor;
auto in_dims = weight0_t->dims();
auto weight_dims = weight1_t->dims();
const float* din = weight0_t->data<float>();
const float* weights = weight1_t->data<float>();
int oc0 = in_dims[0];
int ic = in_dims[1];
int ih = in_dims[2];
int iw = in_dims[3];
int oc = weight_dims[0];
weight_tensor.Resize({oc, ic, ih, iw});
float* dout = weight_tensor.mutable_data<float>();
ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc);
weight0_t->CopyDataFrom(weight_tensor);
}
// compute new conv_bias
if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") &&
conv_op_desc->Input("Bias").size() > 0) {
auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name)
->GetMutable<lite::Tensor>();
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name)
->GetMutable<lite::Tensor>();
Tensor bias;
bias.CopyDataFrom(*bias_t1);
auto bias_data = bias.mutable_data<float>();
ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1);
bias_t1->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
} else {
Tensor bias;
auto weight_dims = weight1_t->dims();
bias.Resize({weight_dims[0]});
auto bias_d = bias.mutable_data<float>();
ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr);
bias_t0->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias0")->arg()->name}); // conv_bias
}
} else {
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
}
}
conv_op_desc->SetType(conv_type0_);
conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name});
conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name});
conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name});
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1"));
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -34,20 +34,25 @@ void QuantDequantFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
}
// fuse quantized node and dequant node
for (auto& op_type :
{"conv2d", "mul", "depthwise_conv2d", "conv2d_transpose"}) {
std::vector<std::string> quantized_op_types = {
"conv2d", "depthwise_conv2d", "conv2d_transpose", "mul"};
for (auto& op_type : quantized_op_types) {
fusion::DequantOpFuser fuser(op_type);
fuser(graph.get());
}
for (auto& op_type : {"conv2d", "depthwise_conv2d", "conv2d_transpose"}) {
for (auto& op_type : quantized_op_types) {
fusion::ChannelWiseDequantOpFuser fuser(op_type);
fuser(graph.get());
}
// process quant_dequant_node
fusion::DeleteQuantDequantOpFuser dqd_fuser;
dqd_fuser(graph.get());
std::vector<std::string> quant_dequant_op_types = {
"fake_quantize_dequantize_abs_max",
"fake_quantize_dequantize_moving_average_abs_max"};
for (auto& op_type : quant_dequant_op_types) {
fusion::DeleteQuantDequantOpFuser dqd_fuser(op_type);
dqd_fuser(graph.get());
}
}
} // namespace mir
......
......@@ -315,30 +315,33 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) {
}
void DeleteQuantDequantOpFuser::BuildPattern() {
std::string quant_dequant_op_type =
"fake_quantize_dequantize_moving_average_abs_max";
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type, "InScale");
auto* input_act_node =
VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X");
auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type)
->assert_is_op(quant_dequant_op_type);
auto* input_act_node = VarNode("input_act_node")
->assert_is_op_input(quant_dequant_op_type_, "X");
auto* quant_dequant_node =
OpNode("quant_dequant_node", quant_dequant_op_type_)
->assert_is_op(quant_dequant_op_type_);
auto* output_scale_node =
VarNode("output_scale_node")
->assert_is_op_output(quant_dequant_op_type, "OutScale");
->assert_is_op_output(quant_dequant_op_type_, "OutScale");
auto* output_act_node =
VarNode("output_act_node")
->assert_is_op_output(quant_dequant_op_type, "Out");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
->assert_is_op_output(quant_dequant_op_type_, "Out");
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node =
VarNode("input_scale_node")
->assert_is_op_input(quant_dequant_op_type_, "InScale");
quant_dequant_node->LinksFrom({input_scale_node, input_act_node});
} else {
quant_dequant_node->LinksFrom({input_act_node});
}
output_scale_node->LinksFrom({quant_dequant_node});
output_act_node->LinksFrom({quant_dequant_node});
}
void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto* input_scale_node = matched.at("input_scale_node");
auto* input_act_node = matched.at("input_act_node");
auto* quant_dequant_node = matched.at("quant_dequant_node");
auto* output_scale_node = matched.at("output_scale_node");
......@@ -368,7 +371,12 @@ void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph,
}
// delete nodes and edges
std::set<const Node*> nodes2rm = {
input_scale_node, quant_dequant_node, output_scale_node, output_act_node};
quant_dequant_node, output_scale_node, output_act_node};
if (quant_dequant_op_type_ ==
"fake_quantize_dequantize_moving_average_abs_max") {
auto* input_scale_node = matched.at("input_scale_node");
nodes2rm.insert(input_scale_node);
}
GraphSafeRemoveNodes(graph, nodes2rm);
}
......
......@@ -39,6 +39,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes_in_order = graph->StmtTopologicalOrder();
}
insts_.emplace_back();
for (auto& item : nodes_in_order) {
if (item->IsStmt()) {
auto& stmt = item->AsStmt();
......@@ -57,7 +58,7 @@ void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
insts_.back().emplace_back(stmt.op(), std::move(stmt.kernels().front()));
}
}
}
......
......@@ -315,4 +315,5 @@ REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU),
TARGET(kMLU)});
TARGET(kMLU),
TARGET(kHuaweiAscendNPU)});