Unverified commit fdf44c59 authored by Leonardo-Ding and committed by GitHub

Merge branch 'develop' into dwh_dev

......@@ -86,6 +86,7 @@ lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_HUAWEI_ASCEND_NPU "Enable HUAWEI_ASCEND_NPU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
......@@ -98,6 +99,7 @@ lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OF
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_WITH_LOG "Enable log printing or not." ON)
lite_option(LITE_WITH_EXCEPTION "Enable throwing exceptions when an error occurs in lite" OFF)
lite_option(LITE_WITH_NVTX "Enable nvtx or not, please enable LITE_WITH_CUDA first." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -224,6 +226,11 @@ endif()
if(LITE_WITH_MLU)
include(mlu)
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
include(device/huawei_ascend_npu)
endif()
include(coveralls)
include(external/mklml) # download mklml package
......
......@@ -174,6 +174,10 @@ if (LITE_WITH_MLU)
add_definitions("-DLITE_WITH_MLU")
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
add_definitions("-DLITE_WITH_HUAWEI_ASCEND_NPU")
endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
endif()
......@@ -190,6 +194,10 @@ if (LITE_WITH_LOG)
add_definitions("-DLITE_WITH_LOG")
endif()
if (LITE_WITH_EXCEPTION)
add_definitions("-DLITE_WITH_EXCEPTION")
endif()
if (LITE_ON_TINY_PUBLISH)
add_definitions("-DLITE_ON_TINY_PUBLISH")
endif()
......
......@@ -80,6 +80,21 @@ if (ARM_TARGET_LANG STREQUAL "clang")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(triple arm-v7a-linux-android)
set(LITE_WITH_OPENMP OFF CACHE STRING "Due to libomp's bug(For ARM64, it has been fixed by https://reviews.llvm.org/D19879, but still exists on ARM32), disable OpenMP on armv7 when cross-compiling using Clang" FORCE)
if(ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
# Use CMAKE_CXX_STANDARD_LIBRARIES_INIT to ensure libunwind and libc++ are linked in the right order
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libunwind.a")
if (ANDROID_API_LEVEL LESS 21)
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libandroid_support.a")
endif()
if(ANDROID_STL_TYPE STREQUAL "c++_shared")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_shared.so")
elseif(ANDROID_STL_TYPE STREQUAL "c++_static")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++_static.a")
set(CMAKE_CXX_STANDARD_LIBRARIES_INIT "${CMAKE_CXX_STANDARD_LIBRARIES_INIT} ${ANDROID_NDK}/sources/cxx-stl/llvm-libc++/libs/${ANDROID_ARCH_ABI}/libc++abi.a")
else()
message(FATAL_ERROR "Invalid Android STL TYPE: ${ANDROID_STL_TYPE}.")
endif()
endif()
else()
message(FATAL_ERROR "Clang do not support this ${ARM_TARGET_ARCH_ABI}, use armv8 or armv7")
endif()
......
......@@ -23,6 +23,21 @@ if(ANDROID)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -llog -fPIC")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog -fPIC")
# Don't re-export libgcc symbols
set(REMOVE_ATOMIC_GCC_SYMBOLS "-Wl,--exclude-libs,libatomic.a -Wl,--exclude-libs,libgcc.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_ATOMIC_GCC_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
# Only the libunwind.a from clang (with libc++) provides C++ exception handling support for 32-bit ARM
# Refer to https://android.googlesource.com/platform/ndk/+/master/docs/BuildSystemMaintainers.md#Unwinding
if (ARM_TARGET_LANG STREQUAL "clang" AND ARM_TARGET_ARCH_ABI STREQUAL "armv7" AND ANDROID_STL_TYPE MATCHES "^c\\+\\+_")
set(REMOVE_UNWIND_SYMBOLS "-Wl,--exclude-libs,libunwind.a")
set(CMAKE_SHARED_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_SHARED_LINKER_FLAGS}")
set(CMAKE_MODULE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_MODULE_LINKER_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${REMOVE_UNWIND_SYMBOLS} ${CMAKE_EXE_LINKER_FLAGS}")
endif()
endif()
if(ARMLINUX)
......@@ -59,14 +74,13 @@ function(check_linker_flag)
endfunction()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
if((LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang")) OR LITE_WITH_PYTHON OR LITE_WITH_EXCEPTION OR (NOT LITE_ON_TINY_PUBLISH))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions -fasynchronous-unwind-tables -funwind-tables")
else ()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions -fno-asynchronous-unwind-tables -fno-unwind-tables")
endif()
if (LITE_ON_TINY_PUBLISH)
if((NOT LITE_WITH_PYTHON))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-exceptions")
endif()
if(LITE_WITH_OPENCL AND (ARM_TARGET_LANG STREQUAL "clang"))
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexceptions")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer -fno-asynchronous-unwind-tables -fno-unwind-tables")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -ffast-math -Ofast -Os -fomit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden -ffunction-sections")
check_linker_flag(-Wl,--gc-sections)
endif()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
# 1. path to Huawei Ascend Install Path
if(NOT DEFINED HUAWEI_ASCEND_NPU_DDK_ROOT)
set(HUAWEI_ASCEND_NPU_DDK_ROOT $ENV{HUAWEI_ASCEND_NPU_DDK_ROOT})
if(NOT HUAWEI_ASCEND_NPU_DDK_ROOT)
message(FATAL_ERROR "Must set HUAWEI_ASCEND_NPU_DDK_ROOT or env HUAWEI_ASCEND_NPU_DDK_ROOT when LITE_WITH_HUAWEI_ASCEND_NPU=ON")
endif()
endif()
message(STATUS "HUAWEI_ASCEND_NPU_DDK_ROOT: ${HUAWEI_ASCEND_NPU_DDK_ROOT}")
# 2. Huawei Ascend include directory
set(ACL_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/include")
set(ATC_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/include")
set(OPP_INCLUDE_DIR "${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp")
include_directories(${ACL_INCLUDE_DIR})
include_directories(${ATC_INCLUDE_DIR})
include_directories(${OPP_INCLUDE_DIR})
# 3. find ACL libs (ACL libs should come before ATC libs)
find_library(ACL_ASCENDCL_FILE NAMES ascendcl
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_ASCENDCL_FILE)
message(FATAL_ERROR "Can not find ACL_ASCENDCL_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_ASCENDCL_FILE Library: ${ACL_ASCENDCL_FILE}")
add_library(acl_ascendcl SHARED IMPORTED GLOBAL)
set_property(TARGET acl_ascendcl PROPERTY IMPORTED_LOCATION ${ACL_ASCENDCL_FILE})
endif()
# 3.1 ascendcl dependency - libruntime.so
find_library(ACL_RUNTIME_FILE NAMES runtime
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64
NO_DEFAULT_PATH)
if(NOT ACL_RUNTIME_FILE)
message(FATAL_ERROR "Can not find ACL_RUNTIME_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/acllib/lib64")
else()
message(STATUS "Found ACL_RUNTIME_FILE Library: ${ACL_RUNTIME_FILE}")
add_library(acl_runtime SHARED IMPORTED GLOBAL)
set_property(TARGET acl_runtime PROPERTY IMPORTED_LOCATION ${ACL_RUNTIME_FILE})
endif()
# 4.1 find ATC libs - libregister.so
find_library(ATC_REGISTER_FILE NAMES register
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_REGISTER_FILE)
message(FATAL_ERROR "Can not find ATC_REGISTER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_REGISTER_FILE Library: ${ATC_REGISTER_FILE}")
add_library(atc_register SHARED IMPORTED GLOBAL)
set_property(TARGET atc_register PROPERTY IMPORTED_LOCATION ${ATC_REGISTER_FILE})
endif()
# 4.1.1 dependency of register - libprotobuf.so.19,
find_library(ATC_PROTOBUF_FILE NAMES libprotobuf.so.19
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_PROTOBUF_FILE)
message(FATAL_ERROR "Can not find ATC_PROTOBUF_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_PROTOBUF_FILE Library: ${ATC_PROTOBUF_FILE}")
add_library(atc_protobuf SHARED IMPORTED GLOBAL)
set_property(TARGET atc_protobuf PROPERTY IMPORTED_LOCATION ${ATC_PROTOBUF_FILE})
endif()
# 4.1.2 dependency of register - libgraph.so
find_library(ATC_GRAPH_FILE NAMES graph
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GRAPH_FILE)
message(FATAL_ERROR "Can not find ATC_GRAPH_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GRAPH_FILE Library: ${ATC_GRAPH_FILE}")
add_library(atc_graph SHARED IMPORTED GLOBAL)
set_property(TARGET atc_graph PROPERTY IMPORTED_LOCATION ${ATC_GRAPH_FILE})
endif()
# 4.2 find ATC libs - libge_compiler.so
find_library(ATC_GE_COMPILER_FILE NAMES ge_compiler
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMPILER_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMPILER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMPILER_FILE Library: ${ATC_GE_COMPILER_FILE}")
add_library(atc_ge_compiler SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_compiler PROPERTY IMPORTED_LOCATION ${ATC_GE_COMPILER_FILE})
endif()
# 4.2.1 dependencies of libge_compiler.so - libge_common.so
find_library(ATC_GE_COMMON_FILE NAMES ge_common
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_GE_COMMON_FILE)
message(FATAL_ERROR "Can not find ATC_GE_COMMON_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_GE_COMMON_FILE Library: ${ATC_GE_COMMON_FILE}")
add_library(atc_ge_common SHARED IMPORTED GLOBAL)
set_property(TARGET atc_ge_common PROPERTY IMPORTED_LOCATION ${ATC_GE_COMMON_FILE})
endif()
# 4.2.3 dependencies of libge_compiler.so - libresource.so
find_library(ATC_RESOURCE_FILE NAMES resource
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_RESOURCE_FILE)
message(FATAL_ERROR "Can not find ATC_RESOURCE_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_RESOURCE_FILE Library: ${ATC_RESOURCE_FILE}")
add_library(atc_resource SHARED IMPORTED GLOBAL)
set_property(TARGET atc_resource PROPERTY IMPORTED_LOCATION ${ATC_RESOURCE_FILE})
endif()
# 4.3 find OPP libs - libopsproto.so
find_library(OPP_OPS_PROTO_FILE NAMES opsproto
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in
NO_DEFAULT_PATH)
if(NOT OPP_OPS_PROTO_FILE)
message(FATAL_ERROR "Can not find OPP_OPS_PROTO_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/opp/op_proto/built-in")
else()
message(STATUS "Found OPP_OPS_PROTO_FILE Library: ${OPP_OPS_PROTO_FILE}")
add_library(opp_ops_proto SHARED IMPORTED GLOBAL)
set_property(TARGET opp_ops_proto PROPERTY IMPORTED_LOCATION ${OPP_OPS_PROTO_FILE})
endif()
# 4.3.1 dependency of opp_ops_proto - liberror_manager.so
find_library(ATC_ERROR_MANAGER_FILE NAMES error_manager
PATHS ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64
NO_DEFAULT_PATH)
if(NOT ATC_ERROR_MANAGER_FILE)
message(FATAL_ERROR "Can not find ATC_ERROR_MANAGER_FILE in ${HUAWEI_ASCEND_NPU_DDK_ROOT}/atc/lib64")
else()
message(STATUS "Found ATC_ERROR_MANAGER_FILE Library: ${ATC_ERROR_MANAGER_FILE}")
add_library(atc_error_manager SHARED IMPORTED GLOBAL)
set_property(TARGET atc_error_manager PROPERTY IMPORTED_LOCATION ${ATC_ERROR_MANAGER_FILE})
endif()
# note: huawei_ascend_npu_runtime_libs should come before huawei_ascend_npu_builder_libs
set(huawei_ascend_npu_runtime_libs acl_ascendcl acl_runtime CACHE INTERNAL "huawei_ascend_npu acllib runtime libs")
set(huawei_ascend_npu_builder_libs atc_register atc_protobuf atc_graph opp_ops_proto atc_error_manager
atc_ge_compiler atc_ge_common atc_resource CACHE INTERNAL "huawei_ascend_npu atc builder libs")
\ No newline at end of file
......@@ -54,6 +54,11 @@ find_library(NPU_DDK_IR_BUILD_FILE NAMES hiai_ir_build
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
# Added in HiAI DDK 320 or later version
find_library(NPU_DDK_HCL_FILE NAMES hcl
PATHS ${NPU_DDK_ROOT}/${NPU_SUB_LIB_PATH}
NO_DEFAULT_PATH)
if(NOT NPU_DDK_HIAI_FILE)
message(FATAL_ERROR "Can not find NPU_DDK_HIAI_FILE in ${NPU_DDK_ROOT}")
else()
......@@ -78,5 +83,13 @@ else()
set_property(TARGET npu_ddk_ir_build PROPERTY IMPORTED_LOCATION ${NPU_DDK_IR_BUILD_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai CACHE INTERNAL "npu ddk runtime libs")
if(NOT NPU_DDK_HCL_FILE)
# message(FATAL_ERROR "Can not find NPU_DDK_HCL_FILE in ${NPU_DDK_ROOT}")
else()
message(STATUS "Found NPU_DDK HCL Library: ${NPU_DDK_HCL_FILE}")
add_library(npu_ddk_hcl SHARED IMPORTED GLOBAL)
set_property(TARGET npu_ddk_hcl PROPERTY IMPORTED_LOCATION ${NPU_DDK_HCL_FILE})
endif()
set(npu_runtime_libs npu_ddk_hiai npu_ddk_hcl CACHE INTERNAL "npu ddk runtime libs")
set(npu_builder_libs npu_ddk_ir npu_ddk_ir_build CACHE INTERNAL "npu ddk builder libs")
......@@ -94,12 +94,10 @@ function(compile_flatbuffers_schema_to_cpp_opt TARGET SRC_FBS OPT)
message(STATUS "SRC_FBS_DIR: ${SRC_FBS_DIR}")
string(REGEX REPLACE "\\.fbs$" "_generated.h" GEN_HEADER ${SRC_FBS})
add_custom_command(
OUTPUT ${GEN_HEADER}
OUTPUT "${CMAKE_CURRENT_SOURCE_DIR}/${GEN_HEADER}"
COMMAND "${FLATBUFFERS_FLATC_EXECUTABLE}"
--cpp --gen-mutable --gen-object-api --reflect-names
--force-empty --force-empty-vectors
${OPT}
-I "${CMAKE_CURRENT_SOURCE_DIR}/tests/include_test"
-o "${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS_DIR}"
"${CMAKE_CURRENT_SOURCE_DIR}/${SRC_FBS}"
DEPENDS flatbuffers
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -118,6 +118,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(var ${lite_deps_HUAWEI_ASCEND_NPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
set(${TARGET} ${deps} PARENT_SCOPE)
endfunction()
......@@ -143,7 +149,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -165,6 +171,7 @@ function(lite_cc_library TARGET)
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
if (args_SHARED OR ARGS_shared)
......@@ -193,7 +200,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -215,6 +222,7 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
if(NOT WIN32)
......@@ -246,7 +254,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -276,6 +284,7 @@ function(lite_cc_test TARGET)
HVY_DEPS ${args_HVY_DEPS}
CV_DEPS ${args_CV_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
# strip binary target to reduce size
......@@ -304,6 +313,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(huawei_ascend_npu_kernels CACHE INTERNAL "huawei_ascend_npu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
......@@ -321,12 +331,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# device: one of (Host, ARM, X86, NPU, MLU, HUAWEI_ASCEND_NPU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra)
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -438,6 +448,15 @@ function(add_kernel TARGET device level)
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "HUAWEI_ASCEND_NPU")
if (NOT LITE_WITH_HUAWEI_ASCEND_NPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(huawei_ascend_npu_kernels "${huawei_ascend_npu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "OPENCL")
if (NOT LITE_WITH_OPENCL)
foreach(src ${args_SRCS})
......@@ -481,6 +500,7 @@ function(add_kernel TARGET device level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -499,7 +519,7 @@ endif()
function(add_operator TARGET level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS HUAWEI_ASCEND_NPU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -537,6 +557,7 @@ function(add_operator TARGET level)
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
HUAWEI_ASCEND_NPU_DEPS ${args_HUAWEI_ASCEND_NPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......
......@@ -37,14 +37,25 @@ rm ./lite/api/paddle_use_kernels.h
rm ./lite/api/paddle_use_ops.h
# Set the build options and start compiling
# android-armv7:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv7 \
--toolchain=clang \
--with_cv=OFF \
--with_log=OFF \
--with_extra=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# android-armv8:cpu+gpu+cv+extra
./lite/tools/build_android.sh \
--arch=armv8 \
--toolchain=clang \
--with_log=OFF \
--with_extra=ON \
--with_cv=ON \
--with_opencl=ON
# Note: for build help, run: ./lite/tools/build_android.sh help
```
......@@ -206,7 +217,7 @@ adb shell "export GLOG_v=4; \
## 3. How to use it in code
That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository;
That is, the code under the `demo/cxx/mobile_light` directory of the build output; for the online version, see [./lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc](https://github.com/PaddlePaddle/Paddle-Lite/blob/develop/lite/demo/cxx/mobile_light/mobilenetv1_light_api.cc) in the GitHub repository, which also shows how to check whether the current device supports OpenCL;
Note: this link points to the latest code on the online develop branch, which may well differ from your local code; it is recommended to consult your own local code under `lite/demo/cxx/` to see how it is used.
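For reference, a minimal hedged sketch (not taken from the demo itself) of how the OpenCL availability check introduced by this change can be used through the light API; the model file name is a placeholder:

```cpp
#include <iostream>
#include "paddle_api.h"  // NOLINT

int main() {
  paddle::lite_api::MobileConfig config;
  // Placeholder path to an optimized .nb model produced by the opt tool.
  config.set_model_from_file("mobilenet_v1_opt.nb");

  // Returns true only if the library was built with LITE_WITH_OPENCL and the
  // current device exposes a usable OpenCL driver; otherwise the CPU kernels
  // are used.
  if (!paddle::lite_api::IsOpenCLBackendValid()) {
    std::cout << "OpenCL backend unavailable, running on ARM CPU kernels."
              << std::endl;
  }

  auto predictor =
      paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::MobileConfig>(
          config);
  auto input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  auto* data = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) data[i] = 1.0f;
  predictor->Run();
  return 0;
}
```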
......
......@@ -13,6 +13,7 @@ message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_HUAWEI_ASCEND_NPU:\t${LITE_WITH_HUAWEI_ASCEND_NPU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
message(STATUS "LITE_WITH_PROFILE:\t${LITE_WITH_PROFILE}")
message(STATUS "LITE_WITH_CV:\t${LITE_WITH_CV}")
......@@ -45,6 +46,7 @@ if (WITH_TESTING)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4_simple.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "MobileNetV1_quant.tar.gz")
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "transformer_with_mask_fp32.tar.gz")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "GoogleNet_inference.tar.gz")
......
......@@ -11,7 +11,7 @@ endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR LITE_WITH_HUAWEI_ASCEND_NPU OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
......@@ -40,13 +40,14 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
if(WIN32)
target_link_libraries(paddle_light_api_shared shlwapi.lib)
endif()
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(APPLE)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/exported_symbols.lds")
set(LINK_FLAGS "-Wl,-exported_symbols_list, ${LINK_MAP_FILE}")
......@@ -94,6 +95,7 @@ if (WITH_TESTING)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
APU_DEPS ${apu_kernels})
endif()
......@@ -112,6 +114,10 @@ if(LITE_WITH_RKNPU)
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
if(LITE_WITH_HUAWEI_ASCEND_NPU)
set(light_api_deps ${light_api_deps} ${huawei_ascend_npu_deps})
set(cxx_api_deps ${cxx_api_deps} ${huawei_ascend_npu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
......@@ -126,6 +132,7 @@ message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
message(STATUS "get HUAWEI_ASCEND_NPU kernels ${huawei_ascend_npu_kernels}")
# for full api
if (NOT LITE_ON_TINY_PUBLISH)
......@@ -144,7 +151,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
# for light api
......@@ -168,7 +176,8 @@ lite_cc_library(light_api SRCS light_api.cc
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
......@@ -191,6 +200,7 @@ if(WITH_TESTING)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
EXCLUDE_COMPILE_DEPS "ON"
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -322,7 +332,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
BM_DEPS ${bm_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -394,6 +405,7 @@ if(NOT WITH_COVERAGE)
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
if (WITH_TESTING)
add_dependencies(test_paddle_api extern_lite_download_lite_naive_model_tar_gz)
......@@ -415,7 +427,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_detection_bin SRCS model_test_detection.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -430,7 +443,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(test_model_classify_bin SRCS model_test_classify.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -445,7 +459,8 @@ if(NOT IOS)
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(benchmark_bin SRCS benchmark.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -459,7 +474,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
lite_cc_binary(multithread_test SRCS lite_multithread_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
......@@ -470,6 +486,7 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -487,7 +504,8 @@ if(NOT IOS)
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
CUDA_DEPS ${cuda_kernels}
HUAWEI_ASCEND_NPU_DEPS ${huawei_ascend_npu_kernels})
endif()
#lite_cc_binary(cxx_api_bin SRCS cxx_api_bin.cc
......
......@@ -37,8 +37,7 @@ void Predictor::SaveModel(const std::string &dir,
if (!program_) {
GenRuntimeProgram();
}
program_->SaveOpInfosToProgram(program_desc_.get());
program_->UpdateVarsOfProgram(program_desc_.get());
program_->SaveToProgram(program_desc_);
switch (model_type) {
case lite_api::LiteModelType::kProtobuf:
SaveModelPb(dir, *program_->exec_scope(), *program_desc_.get(), true);
......@@ -58,18 +57,22 @@ void Predictor::SaveModel(const std::string &dir,
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
auto block_size = program_->block_size();
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
const auto &insts = program_->instructions(block_idx);
for (auto &inst : insts) {
// parse op type information
auto op = node.op()->op_info();
auto op = inst.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
inst.kernel()->op_type() + "," + TargetRepr(inst.kernel()->target()) +
"," + PrecisionRepr(inst.kernel()->precision()) + "," +
DataLayoutRepr(inst.kernel()->layout()) + "," +
inst.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
}
// get source file name from op type and kernel type
auto op2pathmap = OpKernelInfoCollector::Global().GetOp2PathDict();
......@@ -170,9 +173,9 @@ void Predictor::PrepareFeedFetch() {
std::vector<const cpp::OpDesc *> feeds;
std::vector<const cpp::OpDesc *> fetchs;
const auto &insts = program_->instructions();
for (size_t i = 0; i < program_->num_instructions(); i++) {
const auto &op = insts[i].op()->op_info();
const auto &insts = program_->instructions(kRootBlockIdx);
for (auto &inst : insts) {
const auto &op = inst.op()->op_info();
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
......@@ -255,7 +258,6 @@ void Predictor::Build(const lite_api::CxxConfig &config,
} else {
LOG(INFO) << "Load model from file.";
}
Build(model_path,
model_file,
param_file,
......@@ -296,10 +298,10 @@ void Predictor::Build(const std::string &model_path,
Build(program_desc_, valid_places, passes);
}
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &program_desc,
const std::vector<Place> &valid_places,
const std::vector<std::string> &passes) {
program_desc_ = desc;
program_desc_ = program_desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
for (auto &valid_place : valid_places) {
......@@ -336,7 +338,7 @@ void Predictor::Build(const std::shared_ptr<cpp::ProgramDesc> &desc,
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(*desc.get(), scope_, inner_places);
Program program(program_desc_, scope_, inner_places);
valid_places_ = inner_places;
core::KernelPickFactor factor;
......
......@@ -49,18 +49,33 @@ class LITE_API Predictor {
program_desc_ = std::make_shared<cpp::ProgramDesc>();
}
// Create a predictor with the weight variable scope set.
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. Create a predictor with the
// weight variable scope set given.
///////////////////////////////////////////////////////////////////
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
Predictor(const std::shared_ptr<cpp::ProgramDesc>& desc,
///////////////////////////////////////////////////////////////////
// Function: Predictor
// Usage: Constructor of Predictor. This constructor can only
// be called from Predictor->Clone. It creates a predictor from
// an existing ProgramDesc, Scope and RuntimeProgram.
///////////////////////////////////////////////////////////////////
Predictor(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::shared_ptr<Scope>& root,
const std::vector<Place>& valid_places,
const std::vector<std::string>& var_names = {})
: program_desc_(desc), scope_(root) {
Program program(*desc.get(), scope_, valid_places, var_names);
optimizer_ = Optimizer(std::move(program), valid_places);
exec_scope_ = optimizer_.exec_scope();
: program_desc_(program_desc), scope_(root) {
// step1. Create a Program to construct the exec_scope and ops
Program program(program_desc_, scope_, valid_places, var_names);
exec_scope_ = program.exec_scope();
valid_places_ = valid_places;
// step 2. Create the RuntimeProgram.
program_.reset(
new RuntimeProgram(program_desc_, exec_scope_, kRootBlockIdx));
program_generated_ = true;
}
// Build from a model, with places set for hardware config.
......@@ -79,32 +94,62 @@ class LITE_API Predictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool memory_from_memory = false);
void Build(const std::shared_ptr<cpp::ProgramDesc>& desc,
void Build(const std::shared_ptr<cpp::ProgramDesc>& program_desc,
const std::vector<Place>& valid_places,
const std::vector<std::string>& passes = {});
std::shared_ptr<Predictor> Clone() const {
//////////////////////////////////////////////////////////
// Function: Clone
// Usage: Create a Predictor from an existing one; the cloned
// predictor will share persistable variables in scope_ with
// the original predictor.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone() {
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor =
std::make_shared<Predictor>(program_desc_, scope_, valid_places_);
// step3. Return the result
return predictor;
}
std::shared_ptr<Predictor> Clone(
const std::vector<std::string>& var_names) const {
//////////////////////////////////////////////////////////
// Function: Clone(var_names)
// Usage: Create a Predictor from an existing one; the cloned
// predictor shares persistable variables with the original,
// except those named in var_names, which are not shared.
//////////////////////////////////////////////////////////
std::shared_ptr<Predictor> Clone(const std::vector<std::string>& var_names) {
CHECK(program_desc_) << "Both the program and scope of the current "
"predictor should not be nullptr in Clone mode.";
CHECK(scope_) << "Both the program and scope of the current predictor "
"should not be nullptr in Clone mode.";
// step 1. Generate runtime_program, update op_info and var_info in
// program_desc_
if (!program_generated_) {
GenRuntimeProgram();
}
program_->SaveToProgram(program_desc_);
// step 2. Create a predictor from the current program_desc_ and
// runtime_program.
auto predictor = std::make_shared<Predictor>(
program_desc_, scope_, valid_places_, var_names);
for (auto i : var_names) {
predictor->exec_scope_->LocalVar(i);
auto* tensor = predictor->scope_->Var(i)->GetMutable<lite::Tensor>();
// step3. Copy some persistable variables into private scope.
for (auto var_name : var_names) {
predictor->exec_scope_->LocalVar(var_name);
auto* tensor =
predictor->scope_->Var(var_name)->GetMutable<lite::Tensor>();
auto* sub_tensor =
predictor->exec_scope_->Var(i)->GetMutable<lite::Tensor>();
predictor->exec_scope_->Var(var_name)->GetMutable<Tensor>();
sub_tensor->CopyDataFrom(*tensor);
}
// step4. Return the result
return predictor;
}
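The two Clone overloads above can be summarized with a small hedged sketch (assuming the internal lite/api/cxx_api.h header; the variable name is illustrative):

```cpp
#include "lite/api/cxx_api.h"

namespace paddle {
namespace lite {

// Sketch: derive worker predictors from an already-built `base` predictor.
// Clone() shares every persistable variable with `base`; Clone(var_names)
// additionally gives the clone private copies of the named variables so the
// two predictors can modify them independently.
void MakeWorkers(const std::shared_ptr<Predictor>& base) {
  auto shares_all = base->Clone();
  auto owns_fc_weight = base->Clone({"fc_0.w_0"});  // illustrative var name
  (void)shares_all;
  (void)owns_fc_weight;
}

}  // namespace lite
}  // namespace paddle
```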
......@@ -140,6 +185,7 @@ class LITE_API Predictor {
// get a const tensor according to its name
const lite::Tensor* GetTensor(const std::string& name) const;
const RuntimeProgram& runtime_program() const;
Scope* scope() { return scope_.get(); }
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
......@@ -162,7 +208,7 @@ class LITE_API Predictor {
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::shared_ptr<Scope> scope_;
Scope* exec_scope_;
std::unique_ptr<RuntimeProgram> program_;
std::shared_ptr<RuntimeProgram> program_;
bool program_generated_{false};
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
......
......@@ -74,7 +74,15 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
mode_ = config.power_mode();
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
......
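For context, a hedged sketch (not part of this patch) of how the new Huawei Ascend NPU plumbing might be driven from user code via the C++ full API; the model directory, cache path and the x86 fallback place are illustrative assumptions, and `set_subgraph_model_cache_dir` is the pre-existing ConfigBase setter that matches the getter shown later in this diff:

```cpp
#include "paddle_api.h"  // NOLINT

std::shared_ptr<paddle::lite_api::PaddlePredictor> CreateAscendPredictor() {
  paddle::lite_api::CxxConfig config;
  // Placeholder model directory containing __model__ and the params file.
  config.set_model_dir("./mobilenet_v1");
  // Prefer Huawei Ascend NPU subgraphs, with an illustrative host fallback.
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kHuaweiAscendNPU), PRECISION(kFloat)},
       paddle::lite_api::Place{TARGET(kX86), PRECISION(kFloat)}});
  // New in this patch: choose which Ascend device the subgraphs run on.
  config.set_device_id(0);
  // Cache compiled subgraph models so later runs skip subgraph building.
  config.set_subgraph_model_cache_dir("/tmp/ascend_subgraph_cache");
  return paddle::lite_api::CreatePaddlePredictor<paddle::lite_api::CxxConfig>(
      config);
}
```

The `set_device_id` value is what `Init` above consumes through `config.get_device_id()`.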
......@@ -22,17 +22,18 @@ namespace lite {
void LightPredictor::Build(const std::string& lite_model_file,
bool model_from_memory) {
if (model_from_memory) {
LoadModelNaiveFromMemory(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromMemory(
lite_model_file, scope_.get(), program_desc_.get());
} else {
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
LoadModelNaiveFromFile(lite_model_file, scope_.get(), program_desc_.get());
}
// For post-training weight quantization, load the int8/16 weights
// of the optimized model and dequantize them to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
program_desc_.reset();
}
void LightPredictor::Build(const std::string& model_dir,
......@@ -43,15 +44,15 @@ void LightPredictor::Build(const std::string& model_dir,
switch (model_type) {
#ifndef LITE_ON_TINY_PUBLISH
case lite_api::LiteModelType::kProtobuf:
LoadModelPb(model_dir, "", "", scope_.get(), &cpp_program_desc_);
LoadModelPb(model_dir, "", "", scope_.get(), program_desc_.get());
break;
#endif
case lite_api::LiteModelType::kNaiveBuffer: {
if (model_from_memory) {
LoadModelNaiveFromMemory(
model_buffer, param_buffer, scope_.get(), &cpp_program_desc_);
model_buffer, param_buffer, scope_.get(), program_desc_.get());
} else {
LoadModelNaive(model_dir, scope_.get(), &cpp_program_desc_);
LoadModelNaive(model_dir, scope_.get(), program_desc_.get());
}
break;
}
......@@ -60,7 +61,7 @@ void LightPredictor::Build(const std::string& model_dir,
}
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
BuildRuntimeProgram(program_desc_);
PrepareFeedFetch();
}
......@@ -109,15 +110,17 @@ std::vector<std::string> LightPredictor::GetOutputNames() {
}
// append the names of inputs and outputs into input_names_ and output_names_
void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs;
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
} else if (op->Type() == "fetch") {
fetchs.push_back(op);
std::vector<const cpp::OpDesc*> feeds;
std::vector<const cpp::OpDesc*> fetchs;
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
auto main_block = program_desc->GetBlock<cpp::BlockDesc>(kRootBlockIdx);
auto op_size = main_block->OpsSize();
for (size_t op_idx = 0; op_idx < op_size; ++op_idx) {
auto op_desc = main_block->GetOp<cpp::OpDesc>(op_idx);
if (op_desc->Type() == "feed") {
feeds.push_back(op_desc);
} else if (op_desc->Type() == "fetch") {
fetchs.push_back(op_desc);
}
}
input_names_.resize(feeds.size());
......@@ -132,54 +135,35 @@ void LightPredictor::PrepareFeedFetch() {
}
}
void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
void LightPredictor::BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc) {
auto* exe_scope = &scope_->NewScope();
// Prepare workspace
scope_->Var("feed")->GetMutable<std::vector<lite::Tensor>>();
scope_->Var("fetch")->GetMutable<std::vector<lite::Tensor>>();
CHECK(program_desc);
auto block_size = program_desc->BlocksSize();
CHECK(block_size);
for (size_t block_idx = 0; block_idx < block_size; ++block_idx) {
auto block_desc = program_desc->GetBlock<cpp::BlockDesc>(block_idx);
auto var_size = block_desc->VarsSize();
for (size_t var_idx = 0; var_idx < var_size; ++var_idx) {
auto var_desc = block_desc->GetVar<cpp::VarDesc>(var_idx);
if (!var_desc->Persistable()) {
exe_scope->Var(var_desc->Name());
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") continue;
scope_->Var(var_desc->Name());
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
// Only extract the ops and generate the runtime program from the main
// block desc.
program_.reset(new RuntimeProgram(program_desc, exe_scope, kRootBlockIdx));
}
void LightPredictor::DequantizeWeight() {
std::shared_ptr<const cpp::ProgramDesc> program_desc = program_desc_;
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
......@@ -205,10 +189,9 @@ void LightPredictor::DequantizeWeight() {
}
return result;
};
Tensor tmp_tensor;
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t i = 0; i < program_desc->BlocksSize(); i++) {
auto* block = program_desc->GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
......
......@@ -46,6 +46,7 @@ class LITE_API LightPredictor {
LightPredictor(const std::string& lite_model_file,
bool model_from_memory = false) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(lite_model_file, model_from_memory);
}
......@@ -57,6 +58,7 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kNaiveBuffer) {
scope_ = std::make_shared<Scope>();
program_desc_ = std::make_shared<cpp::ProgramDesc>();
Build(model_dir, model_buffer, param_buffer, model_type, model_from_memory);
}
......@@ -78,6 +80,7 @@ class LITE_API LightPredictor {
std::vector<std::string> GetInputNames();
std::vector<std::string> GetOutputNames();
void PrepareFeedFetch();
Scope* scope() { return scope_.get(); }
private:
void Build(const std::string& lite_model_file,
......@@ -91,14 +94,15 @@ class LITE_API LightPredictor {
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool model_from_memory = false);
void BuildRuntimeProgram(const cpp::ProgramDesc& prog);
void BuildRuntimeProgram(
const std::shared_ptr<const cpp::ProgramDesc>& program_desc);
void DequantizeWeight();
private:
std::shared_ptr<Scope> scope_;
std::unique_ptr<RuntimeProgram> program_;
cpp::ProgramDesc cpp_program_desc_;
std::shared_ptr<cpp::ProgramDesc> program_desc_;
std::vector<std::string> input_names_;
std::vector<std::string> output_names_;
};
......
......@@ -38,7 +38,15 @@ void LightPredictorImpl::Init(const lite_api::MobileConfig& config) {
threads_ = config.threads();
#ifdef LITE_WITH_NPU
// Store the model-level configuration into scope for kernels, and use
// exe_scope to store the execution-level configuration
Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
raw_predictor_->scope(), config.subgraph_model_cache_dir());
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
Context<TargetType::kHuaweiAscendNPU>::SetHuaweiAscendDeviceID(
config.get_device_id());
Context<TargetType::kHuaweiAscendNPU>::SetSubgraphModelCacheDir(
config.subgraph_model_cache_dir());
#endif
}
......
......@@ -97,7 +97,7 @@ void TestModel(const std::vector<Place>& valid_places,
if (first_target == TARGET(kOpenCL) || first_target == TARGET(kNPU)) {
ASSERT_EQ(out->dims().production(), 1000);
double eps = first_target == TARGET(kOpenCL) ? 0.15 : 0.1;
double eps = first_target == TARGET(kOpenCL) ? 0.25 : 0.1;
for (int i = 0; i < ref.size(); ++i) {
for (int j = 0; j < ref[i].size(); ++j) {
auto result = pdata[j * step + (out->dims()[1] * i)];
......
......@@ -112,6 +112,8 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
......@@ -201,6 +203,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -265,16 +268,17 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -73,6 +73,8 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kX86));
} else if (target_repr == "npu") {
valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "huawei_ascend_npu") {
valid_places_.emplace_back(TARGET(kHuaweiAscendNPU));
} else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
......@@ -237,7 +239,8 @@ void OptBase::PrintHelpInfo() {
" `set_model_type(protobuf|naive_buffer)`: naive_buffer by "
"default\n"
" `set_lite_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" "
"`set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu|huawei_ascend_npu)`\n"
" `record_model_info(false|true)`: refer to whether to record ops "
"info for striping lib, false by default`\n"
" `run() : start model transformation`\n"
......@@ -274,16 +277,16 @@ void OptBase::PrintExecutableBinHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|huawei_ascend_npu)`"
" Display operators in the input model\n";
std::cout << "paddlelite opt version:" << opt_version << std::endl
<< help_info << std::endl;
......@@ -301,6 +304,7 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kXPU",
"kRKNPU",
"kAPU",
"kHuaweiAscendNPU",
"kAny",
"kUnk"};
// Get the lengh of the first column: maximum length of the op_type
......
......@@ -32,9 +32,22 @@
#include "lite/backends/mlu/target_wrapper.h"
#endif
#ifdef LITE_WITH_OPENCL
#include "lite/backends/opencl/cl_runtime.h"
#endif
namespace paddle {
namespace lite_api {
bool IsOpenCLBackendValid() {
bool opencl_valid = false;
#ifdef LITE_WITH_OPENCL
opencl_valid = paddle::lite::CLRuntime::Global()->OpenCLAvaliableForDevice();
#endif
LOG(INFO) << "opencl_valid:" << opencl_valid;
return opencl_valid;
}
Tensor::Tensor(void *raw) : raw_tensor_(raw) {}
// TODO(Superjomn) refine this by using another `const void* const_raw`;
......
......@@ -33,6 +33,9 @@ using lod_t = std::vector<std::vector<uint64_t>>;
enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK };
// return true if current device supports OpenCL model
LITE_API bool IsOpenCLBackendValid();
struct LITE_API Tensor {
explicit Tensor(void* raw);
explicit Tensor(const void* raw);
......@@ -123,6 +126,7 @@ class LITE_API ConfigBase {
PowerMode mode_{LITE_POWER_NO_BIND};
// to save subgraph model for npu/xpu/...
std::string subgraph_model_cache_dir_{""};
int device_id_{0};
public:
explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1);
......@@ -142,6 +146,9 @@ class LITE_API ConfigBase {
const std::string& subgraph_model_cache_dir() const {
return subgraph_model_cache_dir_;
}
// set Device ID
void set_device_id(int device_id) { device_id_ = device_id; }
const int get_device_id() const { return device_id_; }
};
/// CxxConfig is the config for the Full feature predictor.
......
......@@ -75,7 +75,8 @@ const std::string& TargetToStr(TargetType target) {
"bm",
"mlu",
"rknpu",
"apu"};
"apu",
"huawei_ascend_npu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -118,7 +119,8 @@ const std::string& TargetRepr(TargetType target) {
"kBM",
"kMLU",
"kRKNPU",
"kAPU"};
"kAPU",
"kHuaweiAscendNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -163,7 +165,8 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kMLU),
TARGET(kAPU),
TARGET(kRKNPU),
TARGET(kFPGA)});
TARGET(kFPGA),
TARGET(kHuaweiAscendNPU)});
if (target == TARGET(kAny)) {
return valid_set;
}
......
......@@ -57,7 +57,8 @@ enum class TargetType : int {
kMLU = 11,
kRKNPU = 12,
kAPU = 13,
NUM = 14, // number of fields.
kHuaweiAscendNPU = 14,
NUM = 15, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......
......@@ -28,6 +28,7 @@ USE_MIR_PASS(graph_visualize_pass);
USE_MIR_PASS(remove_tf_redundant_ops_pass);
USE_MIR_PASS(lite_conv_bn_fuse_pass);
USE_MIR_PASS(lite_conv_conv_fuse_pass);
USE_MIR_PASS(lite_fc_fuse_pass);
USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
......@@ -47,12 +48,14 @@ USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
USE_MIR_PASS(lite_scale_activation_fuse_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
......
......@@ -191,6 +191,7 @@ void BindLitePlace(py::module *m) {
.value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("HUAWEI_ASCEND_NPU", TargetType::kHuaweiAscendNPU)
.value("Any", TargetType::kAny);
// PrecisionType
......
......@@ -10,3 +10,4 @@ add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
add_subdirectory(huawei_ascend_npu)
......@@ -234,7 +234,7 @@ void beam_search(const Tensor *pre_ids,
selected_ids->Resize(dims);
selected_scores->Resize(dims);
if (parent_idx) {
parent_idx->Resize(dims);
parent_idx->Resize({static_cast<int64_t>(num_instances)});
}
auto *selected_ids_data = selected_ids->mutable_data<int64_t>();
auto *selected_scores_data = selected_scores->mutable_data<float>();
......
......@@ -139,6 +139,151 @@ static bool conv_trans_weights_numc(const dtype* din,
}
return true;
}
// for example: m = 4, n = 4
// din = [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9 , 10 ,11], [12, 13, 14, 15]]
// dout = [[0, 4, 8, 12], [1, 5, 9, 13], [2, 6, 10, 14], [3, 7, 11, 15]]
/*
m = 8, n = 8:
din  =  0  1  2  3  4  5  6  7      dout =  0  8 16 24 32 40 48 56
        8  9 10 11 12 13 14 15              1  9 17 25 33 41 49 57
       16 17 18 19 20 21 22 23              2 10 18 26 34 42 50 58
       24 25 26 27 28 29 30 31              3 11 19 27 35 43 51 59
       32 33 34 35 36 37 38 39              4 12 20 28 36 44 52 60
       ...                                  ...
*/
template <typename Dtype>
void local_transpose(const Dtype* din, Dtype* dout, int m, int n) {
// n % 4 == 0 && m % 4 == 0
// n * m ==> n * m data trans
int offset_m = m << 2;
const Dtype* din_ptr = din;
Dtype* dout_ptr = dout;
for (int i = 0; i < n; i += 4) {
Dtype* out_ptr0 = dout_ptr;
Dtype* out_ptr1 = dout_ptr + m;
Dtype* out_ptr2 = out_ptr1 + m;
Dtype* out_ptr3 = out_ptr2 + m;
const Dtype* in_ptr0 = din_ptr;
const Dtype* in_ptr1 = din_ptr + m;
const Dtype* in_ptr2 = in_ptr1 + m;
const Dtype* in_ptr3 = in_ptr2 + m;
for (int j = 0; j < m; j += 4) {
float32x4_t vin0 = vld1q_f32(in_ptr0);
float32x4_t vin1 = vld1q_f32(in_ptr1);
float32x4_t vin2 = vld1q_f32(in_ptr2);
float32x4_t vin3 = vld1q_f32(in_ptr3);
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(vin0, vin1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(vin2, vin3);
in_ptr0 = in_ptr3 + m;
in_ptr1 = in_ptr3 + 2 * m;
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
in_ptr2 = in_ptr1 + m;
in_ptr3 = in_ptr1 + 2 * m;
vst1q_f32(out_ptr0, tmp0.val[0]);
vst1q_f32(out_ptr1, tmp0.val[1]);
out_ptr0 += 4;
out_ptr1 += 4;
vst1q_f32(out_ptr2, tmp2.val[0]);
vst1q_f32(out_ptr3, tmp2.val[1]);
out_ptr2 += 4;
out_ptr3 += 4;
}
dout_ptr += offset_m;
din_ptr += 4;
}
}
template <typename Dtype>
void transpose(const Dtype* din, Dtype* dout, int m, int n) {
// nxm == mxn
// 4x4
int cnt_n = n >> 2;
int remain_n = n & 3;
int cnt_m = m >> 2;
int remain_m = m & 3;
int nn_num = n << 2; // n * 4
int mm_num = m << 2; // m * 4
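  // Main body: each step transposes a 4x4 tile of din with NEON; leftover
  // elements in either dimension (remain_n, remain_m) are handled by the
  // scalar tails below.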
for (int x = 0; x < cnt_n; x++) {
const Dtype* din_ptr0 = din + x * mm_num;
const Dtype* din_ptr1 = din_ptr0 + m;
const Dtype* din_ptr2 = din_ptr1 + m;
const Dtype* din_ptr3 = din_ptr2 + m;
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0); // a00 a01 a02 a03
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
// a00 b00 a02 b02 a01 b01 a03 b03
float32x4x2_t tmp0 = vtrnq_f32(din0, din1);
// c00 d00 c02 d02 c01 d01 c03 d03
float32x4x2_t tmp2 = vtrnq_f32(din2, din3);
din_ptr0 += 4;
din_ptr1 += 4;
// a00 b00 c00 d00 a02 b02 c02 d02
// a01 b01 c01 d01 a03 b03 c03 d03
float tmp_val1 = tmp0.val[0][2];
float tmp_val2 = tmp0.val[0][3];
tmp0.val[0][2] = tmp2.val[0][0];
tmp0.val[0][3] = tmp2.val[0][1];
float tmp_val3 = tmp0.val[1][2];
float tmp_val4 = tmp0.val[1][3];
tmp2.val[0][0] = tmp_val1;
tmp2.val[0][1] = tmp_val2;
tmp0.val[1][2] = tmp2.val[1][0];
tmp0.val[1][3] = tmp2.val[1][1];
tmp2.val[1][0] = tmp_val3;
tmp2.val[1][1] = tmp_val4;
din_ptr2 += 4;
din_ptr3 += 4;
vst1q_f32(dout_ptr0, tmp0.val[0]);
vst1q_f32(dout_ptr1, tmp0.val[1]);
dout_ptr0 += nn_num;
vst1q_f32(dout_ptr2, tmp2.val[0]);
vst1q_f32(dout_ptr3, tmp2.val[1]);
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
*dout_ptr0++ = *din_ptr1++;
*dout_ptr0++ = *din_ptr2++;
*dout_ptr0++ = *din_ptr3++;
}
}
const Dtype* din_ptr0 = din + cnt_n * mm_num;
dout = dout + cnt_n * 4;
for (int x = 0; x < remain_n; x++) {
Dtype* dout_ptr0 = dout + x * 4;
for (int y = 0; y < cnt_m; y++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
Dtype* dout_ptr1 = dout_ptr0 + n;
Dtype* dout_ptr2 = dout_ptr1 + n;
Dtype* dout_ptr3 = dout_ptr2 + n;
din_ptr0 += 4;
*dout_ptr0 = din0[0];
*dout_ptr1 = din0[1];
dout_ptr0 += nn_num;
*dout_ptr2 = din0[2];
*dout_ptr3 = din0[3];
}
for (int y = 0; y < remain_m; y++) {
*dout_ptr0++ = *din_ptr0++;
}
}
}
/* preprocessing inputs
 * input din: [1, chin, he - hs, we - ws] --> output dout: [n, chin, 1, we - ws]
 * n = he - hs
......
......@@ -747,6 +747,16 @@ void elementwise_mul<int>(const int* dinx,
}
}
template <>
void elementwise_mul<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
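  // int64 path: plain scalar loop, no NEON specialization.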
for (int i = 0; i < num; i++) {
dout[i] = dinx[i] * diny[i];
}
}
template <>
void elementwise_mul_relu<float>(const float* dinx,
const float* diny,
......@@ -801,6 +811,17 @@ void elementwise_mul_relu<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int num) {
for (int i = 0; i < num; i++) {
int64_t tmp = dinx[i] * diny[i];
dout[i] = tmp > 0 ? tmp : 0;
}
}
template <>
void elementwise_mul_broadcast<float>(const float* dinx,
const float* diny,
......@@ -935,6 +956,29 @@ void elementwise_mul_broadcast<int>(const int* dinx,
}
}
template <>
void elementwise_mul_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
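  // Broadcast semantics: diny holds one value per channel, which is multiplied
  // into the `num` contiguous elements of each (batch, channel) slice of dinx.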
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
*dout_ptr = *dinx_ptr * diny_data;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_mul_relu_broadcast<float>(const float* dinx,
const float* diny,
......@@ -1014,6 +1058,30 @@ void elementwise_mul_relu_broadcast<float>(const float* dinx,
}
}
template <>
void elementwise_mul_relu_broadcast<int64_t>(const int64_t* dinx,
const int64_t* diny,
int64_t* dout,
int batch,
int channels,
int num) {
#pragma omp parallel for collapse(2)
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
int offset = (i * channels + j) * num;
const int64_t* dinx_ptr = dinx + offset;
const int64_t diny_data = diny[j];
int64_t* dout_ptr = dout + offset;
for (int k = 0; k < num; ++k) {
int64_t tmp = *dinx_ptr * diny_data;
*dout_ptr = tmp > 0 ? tmp : 0;
dout_ptr++;
dinx_ptr++;
}
}
}
}
template <>
void elementwise_max<float>(const float* dinx,
const float* diny,
......
......@@ -2044,7 +2044,7 @@ void pooling3x3s1p0_avg(const float* din,
} else {
if (pad_bottom > 1) {
coef_h = 1.f / 3;
} else if (pad_bottom = 1) {
} else if (pad_bottom == 1) {
coef_h = 0.5f;
} else {
coef_h = 1.f;
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace arm {
namespace math {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* fast_malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......
......@@ -46,11 +46,60 @@ void seq_pool_sum<float>(const float* din,
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
height = height - 1;
      int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
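      // Sum over the remaining rows: 4 output lanes per step, accumulating 4
      // rows per inner iteration; `stride` (4 * width) advances the four row
      // pointers to the next group of rows.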
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vaddq_f32(din0, dout_val);
float32x4_t tmp = vaddq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vaddq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vaddq_f32(tmp, dout_val);
}
      for (int h = 0; h < remain_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vaddq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
*dout_ptr += din_ptr0[0];
float tmp = din_ptr1[0] + din_ptr2[0];
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr += din_ptr3[0];
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr += tmp;
}
for (int h = 0; h < remain_h; h++) {
*dout_ptr += din_ptr0[0];
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......@@ -144,12 +193,62 @@ void seq_pool_max<float>(const float* din,
} else {
memcpy(dout_ptr, din_ptr, width * sizeof(float));
din_ptr += width;
      height = height - 1;
int cnt_w = width >> 2;
int remain_w = width & 3;
int cnt_h = height >> 2;
int remain_h = height & 3;
int stride = width << 2;
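      // Max-pool counterpart of the sum path above: the same 4-lane / 4-row
      // unrolling, with vmaxq_f32 in place of the additions.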
for (int w = 0; w < cnt_w; w++) {
const float* din_ptr0 = din_ptr + w * 4;
float32x4_t dout_val = vld1q_f32(dout_ptr);
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
for (int h = 0; h < cnt_h; h++) {
float32x4_t din0 = vld1q_f32(din_ptr0);
float32x4_t din1 = vld1q_f32(din_ptr1);
float32x4_t din2 = vld1q_f32(din_ptr2);
float32x4_t din3 = vld1q_f32(din_ptr3);
dout_val = vmaxq_f32(din0, dout_val);
float32x4_t tmp = vmaxq_f32(din1, din2);
din_ptr0 += stride;
din_ptr1 += stride;
dout_val = vmaxq_f32(din3, dout_val);
din_ptr2 += stride;
din_ptr3 += stride;
dout_val = vmaxq_f32(tmp, dout_val);
}
      for (int h = 0; h < remain_h; h++) {
        float32x4_t din0 = vld1q_f32(din_ptr0);
dout_val = vmaxq_f32(din0, dout_val);
din_ptr0 += width;
}
vst1q_f32(dout_ptr, dout_val);
dout_ptr += 4;
}
const float* din_ptr00 = din_ptr + cnt_w * 4;
for (int w = 0; w < remain_w; w++) {
const float* din_ptr0 = din_ptr00 + w;
const float* din_ptr1 = din_ptr0 + width;
const float* din_ptr2 = din_ptr1 + width;
const float* din_ptr3 = din_ptr2 + width;
      for (int h = 0; h < cnt_h; h++) {
        *dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
float tmp = std::max(din_ptr1[0], din_ptr2[0]);
din_ptr0 += stride;
din_ptr1 += stride;
*dout_ptr = std::max(*dout_ptr, din_ptr3[0]);
din_ptr2 += stride;
din_ptr3 += stride;
*dout_ptr = std::max(*dout_ptr, tmp);
}
      for (int h = 0; h < remain_h; h++) {
*dout_ptr = std::max(*dout_ptr, din_ptr0[0]);
din_ptr0 += width;
}
dout_ptr++;
}
}
}
......
......@@ -11,10 +11,13 @@ nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
nv_library(cuda_sequence2batch SRCS sequence2batch.cu DEPS ${cuda_static_deps})
nv_library(cuda_gemm SRCS gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_batched_gemm SRCS batched_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_strided_gemm SRCS strided_gemm.cc DEPS ${cuda_static_deps})
nv_library(cuda_sequence_padding SRCS sequence_padding.cu DEPS ${cuda_static_deps})
nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
set (
math_cuda
......@@ -25,10 +28,13 @@ set (
cuda_transpose
cuda_elementwise
cudnn_pool
cuda_gru_forward
cuda_sequence2batch
cuda_gemm
cuda_batched_gemm
cuda_strided_gemm
cuda_sequence_padding
cuda_bias
)
set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/backends/cuda/math/utils.h"
......@@ -21,6 +22,20 @@ namespace lite {
namespace cuda {
namespace math {
ActivationType GetActiveType(const std::string& act) {
if (act == "sigmoid") {
return kSigmoid;
} else if (act == "relu") {
return kReLU;
} else if (act == "tanh") {
return kTanh;
} else if (act == "identify") {
return kIdentity;
} else {
LOG(FATAL) << "not supported activation: " << act;
}
}
template <typename T>
__global__ void relu_kernel(const int num,
const float alpha,
......@@ -470,6 +485,76 @@ template void relu(int, const half*, half*, float, cudaStream_t);
template void bias_relu(
int, const float*, const float* bias, float*, float, cudaStream_t);
// ------------- sigmoid -------------
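// fp32 kernel plus half and half2 specializations; the half2 variant packs two
// fp16 values per element, so the launcher below halves the element count when
// `num` is even.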
template <typename T>
__global__ void sigmoid_kernel(const int num, const T* in, T* out) {
CUDA_KERNEL_LOOP(i, num) {
#if __CUDA_ARCH__ >= 350
out[i] = static_cast<T>(1.0f) /
(static_cast<T>(1.0f) + expf(-1 * __ldg(in + i)));
#else
out[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + expf(-in[i]));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half* in, half* out) {
CUDA_KERNEL_LOOP(i, num) {
half tmp = __float2half(1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __hdiv(
tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.0f), __ldg(in + i)))));
#else
out[i] = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i]))));
#endif
}
}
template <>
__global__ void sigmoid_kernel(const int num, const half2* in, half2* out) {
CUDA_KERNEL_LOOP(i, num) {
half2 tmp = __floats2half2_rn(1.0f, 1.0f);
#if __CUDA_ARCH__ >= 530
out[i] = __h2div(tmp,
__hadd2(tmp,
h2exp(__hmul2(__floats2half2_rn(-1.0f, -1.0f),
__ldg(in + i)))));
#else
out[i].x = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].x))));
out[i].y = __float2half(1.0f / (1.0f + expf(-1 * __half2float(in[i].y))));
#endif
}
}
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream) {
sigmoid_kernel<T><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
CUDA_POST_KERNEL_CHECK;
}
template <>
void sigmoid(const int num, const half* din, half* dout, cudaStream_t stream) {
if (num % 2 == 0) {
const half2* din2 = reinterpret_cast<const half2*>(din);
half2* dout2 = reinterpret_cast<half2*>(dout);
sigmoid_kernel<
half2><<<CUDA_GET_BLOCKS(num / 2), CUDA_NUM_THREADS, 0, stream>>>(
num / 2, din2, dout2);
} else {
sigmoid_kernel<half><<<CUDA_GET_BLOCKS(num), CUDA_NUM_THREADS, 0, stream>>>(
num, din, dout);
}
CUDA_POST_KERNEL_CHECK;
}
template void sigmoid(const int num,
const float* din,
float* dout,
cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
......@@ -17,11 +17,22 @@
#include <cuda_runtime.h>
#include <string>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
enum ActivationType {
kSigmoid,
kReLU,
kTanh,
kIdentity,
};
ActivationType GetActiveType(const std::string& act);
// fp32 and half
template <typename T>
void relu(int num, const T* din, T* dout, float alpha, cudaStream_t stream);
......@@ -72,6 +83,9 @@ void bias_int8_nhwc(int num,
const void* scale,
cudaStream_t stream);
template <typename T>
void sigmoid(const int num, const T* din, T* dout, cudaStream_t stream);
} // namespace math
} // namespace cuda
} // namespace lite
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/bias.h"
#include <iostream>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void RowwiseAddKernel(
const T* a, const T* b, T* c, int width, int num) {
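  // One thread per output element: `w` indexes the bias vector of length
  // `width`, which is added to every row of the flattened matrix.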
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = a[i] + b[w];
}
}
template <>
__global__ void RowwiseAddKernel(
const half* a, const half* b, half* c, int width, int num) {
CUDA_KERNEL_LOOP(i, num) {
int h = i / width;
int w = i - h * width;
c[i] = __hadd(a[i], b[w]);
}
}
template <typename T>
void RowwiseAdd<T>::operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream) {
RowwiseAddKernel<T><<<CUDA_GET_BLOCKS(count), CUDA_NUM_THREADS, 0, stream>>>(
input, bias, output, width, count);
CUDA_POST_KERNEL_CHECK;
}
template struct RowwiseAdd<float>;
template struct RowwiseAdd<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
struct RowwiseAdd {
void operator()(const T* input,
const T* bias,
T* output,
const int width,
const int count,
const cudaStream_t& stream);
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <iostream>
#include "lite/backends/cuda/math/gru_forward.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
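// gate_value layout per batch row: [update | reset | candidate], each of
// length frame_size. This kernel activates the update and reset gates and
// writes reset_output = prev_output * reset_gate.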
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
T prev_out = 0;
T reset_out_val;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == lite::cuda::math::ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == lite::cuda::math::ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
reset_out_val = prev_out * reset_gate_value;
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
template <>
__global__ void GruForwardResetOutput(
half* gate_value,
half* reset_output_value,
half* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) return;
gate_value += batch_idx * 3 * frame_size;
reset_output_value += batch_idx * frame_size;
}
half prev_out = 0;
half reset_out_val;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half reset_gate_value = gate_value[frame_idx + frame_size * 1];
if (prev_output_value) {
if (is_batch) {
prev_output_value += batch_idx * frame_size;
}
prev_out = prev_output_value[frame_idx];
}
if (active_gate == ActivationType::kSigmoid) {
update_gate_value = Sigmoid(update_gate_value);
reset_gate_value = Sigmoid(reset_gate_value);
} else if (active_gate == ActivationType::kReLU) {
update_gate_value = ReLU(update_gate_value);
reset_gate_value = ReLU(reset_gate_value);
} else if (active_gate == ActivationType::kTanh) {
update_gate_value = Tanh(update_gate_value);
reset_gate_value = Tanh(reset_gate_value);
}
#if __CUDA_ARCH__ >= 530
reset_out_val = __hmul(prev_out, reset_gate_value);
#else
reset_out_val =
__float2half(__half2float(prev_out) * __half2float(reset_gate_value));
#endif
gate_value[frame_idx + frame_size * 0] = update_gate_value;
gate_value[frame_idx + frame_size * 1] = reset_gate_value;
reset_output_value[frame_idx] = reset_out_val;
}
/*
* threads(frame_per_block, batch_per_block)
* grid(frame_blocks, batch_blocks)
*/
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
T output;
T prev_out = 0;
T update_gate_value = gate_value[frame_idx + frame_size * 0];
T state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
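  // origin_mode: h = u * h_prev + (1 - u) * c; otherwise: h = (1 - u) * h_prev + u * c,
  // where u is the update gate and c the activated candidate state.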
if (origin_mode) {
output = update_gate_value * prev_out + state_frame_value -
update_gate_value * state_frame_value;
} else {
output = prev_out - update_gate_value * prev_out +
update_gate_value * state_frame_value;
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template <>
__global__ void GruForwardFinalOutput(
half* gate_value,
half* prev_output_value,
half* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch) {
const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (frame_idx >= frame_size) return;
int batch_idx = 0;
if (is_batch) {
batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
if (batch_idx >= batch_size) {
return;
}
gate_value += batch_idx * 3 * frame_size;
output_value += batch_idx * frame_size;
}
half output;
half prev_out = 0;
half update_gate_value = gate_value[frame_idx + frame_size * 0];
half state_frame_value = gate_value[frame_idx + frame_size * 2];
if (prev_output_value) {
if (is_batch) prev_output_value += batch_idx * frame_size;
prev_out = prev_output_value[frame_idx];
}
if (active_node == lite::cuda::math::ActivationType::kSigmoid) {
state_frame_value = Sigmoid(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kReLU) {
state_frame_value = ReLU(state_frame_value);
} else if (active_node == lite::cuda::math::ActivationType::kTanh) {
state_frame_value = Tanh(state_frame_value);
}
if (origin_mode) {
#if __CUDA_ARCH__ >= 530
output =
__hsub(__hadd(__hmul(update_gate_value, prev_out), state_frame_value),
__hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(state_frame_value) -
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
} else {
#if __CUDA_ARCH__ >= 530
    output = __hadd(__hsub(prev_out, __hmul(update_gate_value, prev_out)),
                    __hmul(update_gate_value, state_frame_value));
#else
output = __float2half(
__half2float(prev_out) -
__half2float(update_gate_value) * __half2float(prev_out) +
__half2float(update_gate_value) * __half2float(state_frame_value));
#endif
}
gate_value[frame_idx + frame_size * 2] = state_frame_value;
output_value[frame_idx] = output;
}
template __global__ void GruForwardFinalOutput<float>(
float* gate_value,
float* prev_output_value,
float* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
template __global__ void GruForwardResetOutput<float>(
float* gate_value,
float* reset_output_value,
float* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/activation.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
template <typename Dtype>
inline __device__ Dtype Sigmoid(const Dtype a) {
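  // Clamp the input to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] so that
  // expf cannot overflow before evaluating 1 / (1 + exp(-x)).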
const Dtype min = SIGMOID_THRESHOLD_MIN;
const Dtype max = SIGMOID_THRESHOLD_MAX;
Dtype tmp = (a < min) ? min : ((a > max) ? max : a);
return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-tmp));
}
template <>
inline __device__ half Sigmoid(const half a) {
#if __CUDA_ARCH__ >= 530
const half tmp = __float2half(1.0f);
return __hdiv(tmp, __hadd(tmp, hexp(__hmul(__float2half(-1.f), a))));
#else
return __float2half(1.0f / (expf(__half2float(a) * -1) + 1.0f));
#endif
}
template <typename Dtype>
inline __device__ Dtype ReLU(const Dtype a) {
return a > static_cast<Dtype>(0.f) ? a : static_cast<Dtype>(0.f);
}
template <>
inline __device__ half ReLU(const half a) {
const half tmp = __float2half(0.f);
#if __CUDA_ARCH__ >= 530
return __hgt(a, tmp) ? a : tmp;
#else
return __float2half(__half2float(a) > 0.f ? __half2float(a) : 0.f);
#endif
}
template <typename Dtype>
inline __device__ Dtype Tanh(const Dtype a) {
Dtype tmp = static_cast<Dtype>(-2.0) * a;
tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
static_cast<Dtype>(1.0);
}
template <>
inline __device__ half Tanh(const half a) {
#if __CUDA_ARCH__ >= 530
half tmp = __float2half(1.0f);
half numerator = __hmul(__float2half(-2.0f), a);
return __hsub(__hdiv(__float2half(2.0f), __hadd(tmp, hexp(numerator))), tmp);
#else
float tmp = -2.0f * __half2float(a);
return __float2half(2.0f / (1.0f + expf(tmp)) - 1.0f);
#endif
}
template <typename T>
__global__ void GruForwardResetOutput(
T* gate_value,
T* reset_output_value,
T* prev_output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_gate,
bool is_batch);
template <typename T>
__global__ void GruForwardFinalOutput(
T* gate_value,
T* prev_output_value,
T* output_value,
int frame_size,
int batch_size,
lite::cuda::math::ActivationType active_node,
bool origin_mode,
bool is_batch);
/*
* threads(tile_size, 1)
* grids(frame_blocks, 1)
*/
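// Computes gate = activation(x + h_prev * gate_weight) for the update and
// reset gates (2 * frame_size columns), accumulating the matrix product with
// TiledSize-wide warp shuffles, and writes reset_output = reset_gate * h_prev.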
template <class T, int TiledSize>
__global__ void FastCollectiveGruGate(T* gate_value,
T* prev_output_value,
T* gate_weight,
T* reset_output,
int frame_size,
ActivationType active_node) {
T xt_0 = 0.0f;
T a0 = 0.0f;
T c0 = 0.0f;
T b0[TiledSize];
int col = blockIdx.x * blockDim.x + threadIdx.x;
int tiled_mask = ((1 << TiledSize) - 1);
// tiled matrix multiply using register shift, faster than sm.
if (prev_output_value) {
for (int k = 0; k < (((frame_size - 1) / TiledSize) + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = prev_output_value[threadIdx.x + (k * TiledSize)];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size * 2 && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size * 2 + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size * 2) {
xt_0 = gate_value[col];
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col] = c0;
if (frame_size <= col && col < frame_size * 2) {
T htp_0 = 0.0;
if (prev_output_value) {
htp_0 = prev_output_value[col - frame_size];
}
reset_output[col - frame_size] = c0 * htp_0;
} else if (col < frame_size) {
gate_value[col] = c0;
}
}
}
template <class T, int TiledSize>
__global__ void FastCollectiveGruOut(T* gate_weight,
T* prev_out_value,
T* output_value,
T* gate_value,
T* reset_value,
int frame_size,
ActivationType active_node,
bool origin_mode) {
int col = blockIdx.x * blockDim.x + threadIdx.x;
T a0 = 0.0f;
T b0[TiledSize];
T c0 = 0.0f;
int tiled_mask = ((1 << TiledSize) - 1);
if (prev_out_value) {
for (int k = 0; k < ((frame_size - 1) / TiledSize + 1); ++k) {
a0 = 0;
if ((threadIdx.x + k * TiledSize) < frame_size) {
a0 = reset_value[threadIdx.x + k * TiledSize];
}
for (int i = 0; i < TiledSize; ++i) {
if (col < frame_size && (i + k * TiledSize) < frame_size) {
b0[i] = gate_weight[(i + k * TiledSize) * frame_size + col];
}
}
for (int i = 0; i < TiledSize; ++i) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
c0 = c0 + __shfl_sync(tiled_mask, a0, i, TiledSize) * b0[i];
#else
c0 = c0 + __shfl(a0, i, TiledSize) * b0[i];
#endif
}
}
}
__syncthreads();
if (col < frame_size) {
T xt_0 = gate_value[col + 2 * frame_size];
T gta_0 = gate_value[col];
T htp_0 = 0;
if (prev_out_value) {
htp_0 = prev_out_value[col];
}
c0 += xt_0;
if (active_node == ActivationType::kSigmoid) {
c0 = Sigmoid(c0);
} else if (active_node == ActivationType::kReLU) {
c0 = ReLU(c0);
} else if (active_node == ActivationType::kTanh) {
c0 = Tanh(c0);
}
gate_value[col + 2 * frame_size] = c0;
if (origin_mode) {
output_value[col] = htp_0 * gta_0 + (1 - gta_0) * c0;
} else {
output_value[col] = c0 * gta_0 + (1 - gta_0) * htp_0;
}
}
}
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -22,10 +22,6 @@ namespace lite {
namespace cuda {
namespace math {
#define CUDA_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
template <typename T>
__global__ void scale_kernel(int count,
const T* in_data,
......@@ -48,7 +44,6 @@ __global__ void scale_kernel(int count,
template <typename T>
__global__ void scale_kernel(
int count, const T* in_data, T* out_data, const T scale, const T bias) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_KERNEL_LOOP(tid, count) { out_data[tid] = scale * in_data[tid] + bias; }
}
......@@ -133,12 +128,11 @@ void fp32_scale_nhwc(int num,
}
template <typename T>
void scale(int num, const T* in, T* out, T scale, cudaStream_t stream, T bias) {
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread, 0, stream>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -146,11 +140,10 @@ void scale(int num, const T* in, T* out, T scale, T bias) {
int thread = 256;
int block = (num + thread - 1) / thread;
scale_kernel<<<block, thread>>>(num, in, out, scale, bias);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) std::cout << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void scale(int num, const float*, float*, float, cudaStream_t, float);
template void scale(int num, const float*, float*, float, float, cudaStream_t);
template void scale(int num, const float*, float*, float, float);
} // namespace math
......
......@@ -32,8 +32,7 @@ void fp32_scale_nhwc(int num,
cudaStream_t stream);
template <typename T>
void scale(
int num, const T* in, T* out, T scale, cudaStream_t stream, T bias = 0);
void scale(int num, const T* in, T* out, T scale, T bias, cudaStream_t stream);
template <typename T>
void scale(int num, const T* in, T* out, T scale, T bias = 0);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/math/sequence2batch.h"
#include "lite/backends/cuda/math/utils.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
__global__ void CopyMatrixRowsKernel(const T* src,
T* dst,
const uint64_t* index,
int height,
int width,
bool is_src_index) {
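  // Each block copies blockDim.y rows; within a row, blockDim.x threads stride
  // across `width` elements. `index` maps between source and destination row
  // ids depending on is_src_index.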
int idx = threadIdx.x;
int idy = threadIdx.y;
int row_id = blockDim.y * blockIdx.x + idy;
if (row_id < height) {
int src_idx = is_src_index ? index[row_id] : row_id;
int dst_idx = is_src_index ? row_id : index[row_id];
const T* src_data = src + src_idx * width;
T* dst_data = dst + dst_idx * width;
for (int i = idx; i < width; i += blockDim.x) {
dst_data[i] = src_data[i];
}
}
}
template <typename T>
void CopyMatrixRowsFunctor<T>::operator()(
const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream) {
auto src_dims = src.dims();
auto dst_dims = dst->dims();
CHECK_EQ(src_dims.size(), 2) << "The src must be matrix with rank 2.";
CHECK_EQ(dst_dims.size(), 2) << "The dst must be matrix with rank 2.";
CHECK_EQ(src_dims[1], dst_dims[1])
<< "The width of src and dst must be same.";
int height = dst_dims[0];
int width = dst_dims[1];
const auto* src_data = src.data<T>();
auto* dst_data = dst->template mutable_data<T>(TARGET(kCUDA));
index_tensor_.Resize({static_cast<int64_t>(index_lod.size())});
auto* index_tensor_data = index_tensor_.mutable_data<uint64_t>(TARGET(kCUDA));
TargetWrapperCuda::MemcpyAsync(index_tensor_data,
index_lod.data(),
sizeof(uint64_t) * index_lod.size(),
IoDirection::HtoD,
stream);
dim3 threads(128, 8);
dim3 grids((height + threads.y - 1) / threads.y);
CopyMatrixRowsKernel<T><<<grids, threads, 0, stream>>>(
src_data, dst_data, index_tensor_data, height, width, is_src_index);
CUDA_POST_KERNEL_CHECK;
}
template class CopyMatrixRowsFunctor<float>;
template class CopyMatrixRowsFunctor<half>;
template class LoDTensor2BatchFunctor<float>;
template class LoDTensor2BatchFunctor<half>;
template class Batch2LoDTensorFunctor<float>;
template class Batch2LoDTensorFunctor<half>;
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include <algorithm>
#include <string>
#include <vector>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace cuda {
namespace math {
template <typename T>
class CopyMatrixRowsFunctor {
public:
// If is_src_index is true, copy the indexed rows of input src to the output
// dst. If is_src_index is false, copy the input src to the indexed of output
// dst. The indexes rows are based on the input index.
void operator()(const lite::Tensor& src,
lite::Tensor* dst,
const std::vector<uint64_t>& index_lod,
bool is_src_index,
const cudaStream_t& stream);
private:
lite::Tensor index_tensor_;
};
template <typename T>
class LoDTensor2BatchFunctor {
// Calculate the length of each sequence and
// sort sequence index by the length.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
struct SeqInfo {
SeqInfo(size_t start_val, size_t len_val, size_t seq_val)
: start(start_val), length(len_val), seq_idx(seq_val) {}
size_t start;
size_t length;
size_t seq_idx;
};
public:
void operator()(const lite::Tensor& lod_tensor,
lite::Tensor* batch_tensor,
bool is_reverse,
const cudaStream_t& stream) const {
auto lods = lod_tensor.lod();
CHECK_EQ(lods.size(), 1UL) << "Only support one level sequence now.";
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (int seq_id = 0; seq_id < static_cast<int>(lod.size()) - 1; ++seq_id) {
size_t length = lod[seq_id + 1] - lod[seq_id];
seq_info.emplace_back(lod[seq_id], length, seq_id);
}
std::sort(seq_info.begin(), seq_info.end(), [](SeqInfo a, SeqInfo b) {
return a.length > b.length;
});
// Calculate the start position of each batch.
// example: sequences = {s0, s1, s2}
// s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
// max_seqlen = 5,
// batchIndex = {b0, b1, b2, b3, b4}
// b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
// batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
// batch_start_positions[0] = 0
// batch_start_positions[1] = len(b0)
// batch_start_positions[2] = len(b0) + len(b1)
// ...
// seq2batch_idx[12] = {4, 0, 9,
// 5, 1, 10,
// 6, 2, 11,
// 7, 3,
// 8}
// seq_order = {1, 0, 2}, the sort order.
// where 1 is the second sequence,
// 0 is the first sequence,
// 2 is the third sequence.
LoD batch_lods;
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
batch_lods.emplace_back(std::vector<uint64_t>{0});
// batch_lods[0] is the start positions for batch LoDTensor
size_t max_seqlen = seq_info[0].length;
batch_lods[0].resize(max_seqlen + 1);
// batch_lods[1] is the raw index in the input LoDTensor
batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods[2].resize(seq_info.size());
auto* batch_starts = batch_lods[0].data();
auto* seq2batch_idx = batch_lods[1].data();
batch_starts[0] = 0;
for (size_t n = 0; n < max_seqlen; ++n) {
size_t batch_id = batch_starts[n];
for (size_t i = 0; i < seq_info.size(); ++i) {
size_t seq_len = seq_info[i].length;
size_t start = seq_info[i].start;
if (n < seq_len) {
seq2batch_idx[batch_id] =
is_reverse ? start + seq_len - 1 - n : start + n;
++batch_id;
} else {
break;
}
}
batch_starts[n + 1] = batch_id;
}
auto* seq_order = batch_lods[2].data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
batch_tensor->set_lod(batch_lods);
lite::cuda::math::CopyMatrixRowsFunctor<T> to_batch;
to_batch(lod_tensor, batch_tensor, batch_lods[1], true, stream);
CUDA_POST_KERNEL_CHECK;
}
};
template <typename T>
class Batch2LoDTensorFunctor {
public:
void operator()(const lite::Tensor& batch_tensor,
lite::Tensor* lod_tensor,
const cudaStream_t& stream) {
auto in_lod = batch_tensor.lod();
    CHECK_GT(in_lod.size(), 2UL) << "The LoD of LoDTensor should include at "
                                    "least 2-level sequence information.";
CHECK_EQ(in_lod[1].size(), static_cast<size_t>(lod_tensor->dims()[0]))
<< "The LoD information should be consistent with the dims.";
lite::cuda::math::CopyMatrixRowsFunctor<T> to_seq;
to_seq(batch_tensor, lod_tensor, in_lod[1], false, stream);
CUDA_POST_KERNEL_CHECK;
}
};
} // namespace math
} // namespace cuda
} // namespace lite
} // namespace paddle
......@@ -86,8 +86,7 @@ void SequencePadding(T* pad_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template <typename T>
......@@ -120,8 +119,7 @@ void SequenceUnpadding(T* seq_data,
seq_num,
pad_seq_len,
step_width);
cudaError_t error = cudaGetLastError();
if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
CUDA_POST_KERNEL_CHECK;
}
template void SequencePadding(float* pad_data,
......
......@@ -15,6 +15,7 @@
#pragma once
#include <cuda.h>
#include <cuda_runtime.h>
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/target_wrapper.h"
namespace paddle {
......@@ -31,6 +32,16 @@ class TargetWrapper<TARGET(kCUDA)> {
static size_t num_devices();
static size_t maximum_stream() { return 0; }
static int GetComputeCapability() {
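    // Returns the compute capability as major * 10 + minor (e.g. 75 for sm_75).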
int dev_id = GetCurDevice();
int major, minor;
CUDA_CALL(cudaDeviceGetAttribute(
&major, cudaDevAttrComputeCapabilityMajor, dev_id));
CUDA_CALL(cudaDeviceGetAttribute(
&minor, cudaDevAttrComputeCapabilityMinor, dev_id));
return major * 10 + minor;
}
static size_t GetCurDevice() {
int dev_id;
cudaGetDevice(&dev_id);
......
......@@ -19,7 +19,7 @@
namespace paddle {
namespace lite {
const int MALLOC_ALIGN = 64;
const int MALLOC_ALIGN = 16;
void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
size_t offset = sizeof(void*) + MALLOC_ALIGN - 1;
......@@ -30,7 +30,6 @@ void* TargetWrapper<TARGET(kHost)>::Malloc(size_t size) {
void* r = reinterpret_cast<void*>(reinterpret_cast<size_t>(p + offset) &
(~(MALLOC_ALIGN - 1)));
static_cast<void**>(r)[-1] = p;
memset(r, 0, size);
return r;
}
void TargetWrapper<TARGET(kHost)>::Free(void* ptr) {
......
if(NOT LITE_WITH_HUAWEI_ASCEND_NPU)
return()
endif()
lite_cc_library(model_client_huawei_ascend_npu SRCS model_client.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs})
lite_cc_library(device_huawei_ascend_npu SRCS device.cc DEPS ${huawei_ascend_npu_runtime_libs} ${huawei_ascend_npu_builder_libs} model_client_huawei_ascend_npu)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/device.h"
#include <map>
#include <utility>
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/graph.h"
#include "lite/utils/io.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
std::shared_ptr<AclModelClient> Device::LoadFromMem(
const std::vector<char>& model_buffer, const int device_id) {
if (model_buffer.size() == 0) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] model_buffer size is ZERO!";
return nullptr;
}
  // Create an ACL model client to load the om model
std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
// Load model from memory
if (model_client->LoadFromMem(
reinterpret_cast<const void*>(model_buffer.data()),
model_buffer.size())) {
return model_client;
}
return nullptr;
}
std::shared_ptr<AclModelClient> Device::LoadFromFile(
const std::string& model_path, const int device_id) {
if (!paddle::lite::IsFileExists(model_path)) {
    VLOG(3) << "[HUAWEI_ASCEND_NPU] om model file does not exist: " << model_path;
return nullptr;
}
  // Create an ACL model client to load the om model
  std::shared_ptr<AclModelClient> model_client(new AclModelClient(device_id));
  // Load model from file
if (model_client->LoadFromFile(model_path.c_str())) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
return model_client;
}
return nullptr;
}
std::mutex Device::device_mutex_;
bool Device::Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer) {
std::lock_guard<std::mutex> lock(device_mutex_);
  // Convert the ACL IR graph to the ACL om model
ge::Graph ir_graph("graph");
ir_graph.SetInputs(input_nodes).SetOutputs(output_nodes);
// Build IR model
ge::ModelBufferData om_buffer;
std::map<std::string, std::string> options;
options.insert(std::make_pair(ge::ir_option::LOG_LEVEL, "error"));
ATC_CALL(aclgrphBuildModel(ir_graph, options, om_buffer));
// Copy from om model buffer
model_buffer->resize(om_buffer.length);
memcpy(reinterpret_cast<void*>(model_buffer->data()),
reinterpret_cast<void*>(om_buffer.data.get()),
om_buffer.length);
return true;
}
void Device::InitOnce() {
if (runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] runtime already inited!";
return;
}
// ACL runtime init => can only be called once in one process
ACL_CALL(aclInit(NULL));
// ATC builder init => can only be called once in one process
std::map<std::string, std::string> global_options;
global_options.insert(
std::make_pair(ge::ir_option::SOC_VERSION, "Ascend310"));
ATC_CALL(ge::aclgrphBuildInitialize(global_options));
runtime_inited_ = true;
}
void Device::DestroyOnce() {
if (!runtime_inited_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to destroy runtime!";
return;
}
// ATC builder finalize => can only be called once in one process
ge::aclgrphBuildFinalize();
// ACL runtime finalize => can only be called once in one process
ACL_CALL(aclFinalize());
runtime_inited_ = false;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() { InitOnce(); }
~Device() { DestroyOnce(); }
std::shared_ptr<AclModelClient> LoadFromMem(
const std::vector<char>& model_buffer, const int device_id);
std::shared_ptr<AclModelClient> LoadFromFile(const std::string& model_path,
const int device_id);
// Build the ACL IR graph to the ACL om model
bool Build(std::vector<ge::Operator>& input_nodes, // NOLINT
std::vector<ge::Operator>& output_nodes, // NOLINT
std::vector<char>* model_buffer); // NOLINT
private:
void InitOnce();
void DestroyOnce();
bool runtime_inited_{false};
static std::mutex device_mutex_;
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/huawei_ascend_npu/model_client.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
bool AclModelClient::LoadFromMem(const void* data, uint32_t size) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret = aclmdlQuerySizeFromMem(
data, size, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from memory failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
    LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weight "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromMemWithMem(data,
size,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from memory failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] AclModelClient LoadFromMem success.";
load_flag_ = true;
return true;
}
bool AclModelClient::LoadFromFile(const char* model_path) {
if (load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] model is already loaded!";
return true;
}
auto ret =
aclmdlQuerySize(model_path, &model_memory_size_, &model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] query model size from file failed!";
return false;
}
ret = aclrtMalloc(
&model_memory_ptr_, model_memory_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model memory "
"failed, require size is "
<< model_memory_size_;
return false;
}
ret = aclrtMalloc(
&model_weight_ptr_, model_weight_size_, ACL_MEM_MALLOC_HUGE_FIRST);
if (ret != ACL_ERROR_NONE) {
    LOG(WARNING) << "[HUAWEI_ASCEND_NPU] malloc buffer for model weight "
"failed, require size is "
<< model_weight_size_;
return false;
}
ret = aclmdlLoadFromFileWithMem(model_path,
&model_id_,
model_memory_ptr_,
model_memory_size_,
model_weight_ptr_,
model_weight_size_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Load model from file failed!";
return false;
}
model_desc_ = aclmdlCreateDesc();
if (model_desc_ == nullptr) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] create model description failed!";
return false;
}
ret = aclmdlGetDesc(model_desc_, model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] get model description failed!";
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] Loading model file success:" << model_path;
load_flag_ = true;
return true;
}
bool AclModelClient::GetModelIOTensorDim(
std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor) {
if (!model_desc_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] GetModelIOTensorDim failed!";
return false;
}
size_t input_num = aclmdlGetNumInputs(model_desc_);
  VLOG(3) << "[HUAWEI_ASCEND_NPU] input number is " << input_num;
for (size_t i = 0; i < input_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing input [" << i << "] ....";
aclmdlIODims input_dim;
aclmdlGetInputDims(model_desc_, i, &input_dim);
aclDataType data_type = aclmdlGetInputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of inputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetInputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of inputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, input_dim, data_format);
input_tensor->push_back(tensor_desc);
}
size_t output_num = aclmdlGetNumOutputs(model_desc_);
  VLOG(3) << "[HUAWEI_ASCEND_NPU] output number is " << output_num;
for (size_t i = 0; i < output_num; i++) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] printing output [" << i << "] ....";
aclmdlIODims output_dim;
aclmdlGetOutputDims(model_desc_, i, &output_dim);
aclDataType data_type = aclmdlGetOutputDataType(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_type of outputs[" << i << "] is "
<< data_type;
aclFormat data_format = aclmdlGetOutputFormat(model_desc_, i);
VLOG(3) << "[HUAWEI_ASCEND_NPU] data_format of outputs[" << i << "] is "
<< data_format;
TensorDesc tensor_desc = TensorDesc(data_type, output_dim, data_format);
output_tensor->push_back(tensor_desc);
}
return true;
}
bool AclModelClient::GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
size_t device_output_num = aclmdlGetDatasetNumBuffers(output_dataset_);
size_t tensor_output_num = reinterpret_cast<size_t>(output_tensor->size());
if (device_output_num != tensor_output_num) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output number not equal, device number is "
        << device_output_num << ", tensor number is " << tensor_output_num;
return false;
}
for (size_t i = 0; i < device_output_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(output_dataset_, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
uint32_t device_size = aclGetDataBufferSize(buffer_device);
void* tensor_data = nullptr;
aclError ret = aclrtMallocHost(&tensor_data, device_size);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMallocHost failed, ret " << ret;
return false;
}
ret = aclrtMemcpy(tensor_data,
device_size,
device_data,
device_size,
ACL_MEMCPY_DEVICE_TO_HOST);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] aclrtMemcpy failed, ret " << ret;
return false;
}
if (output_tensor->at(i)->SetData(reinterpret_cast<uint8_t*>(tensor_data),
device_size) != ge::GRAPH_SUCCESS) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] SetData to output tensor failed";
return false;
}
}
VLOG(3)
<< "[HUAWEI_ASCEND_NPU] Get output tensor from output dataset succeed.";
return true;
}
void AclModelClient::CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor) {
input_dataset_ = aclmdlCreateDataset();
if (input_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create input dataset failed!";
return;
}
for (size_t i = 0; i < input_tensor->size(); i++) {
auto item = input_tensor->at(i);
size_t buffer_size = item->GetSize();
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] input malloc device buffer failed. size is "
<< buffer_size;
return;
}
void* buffer_data = reinterpret_cast<void*>(item->GetData());
ret = aclrtMemcpy(buffer_device,
buffer_size,
buffer_data,
buffer_size,
ACL_MEMCPY_HOST_TO_DEVICE);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input memcpy failed, buffer size is "
<< buffer_size;
aclrtFree(buffer_device);
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(input_dataset_, data_buffer) != ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] input aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateInputDataset succeed.";
}
void AclModelClient::CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
output_dataset_ = aclmdlCreateDataset();
if (output_dataset_ == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] create output dataset failed!";
return;
}
size_t output_size = aclmdlGetNumOutputs(model_desc_);
CHECK_EQ(output_size, output_tensor->size());
for (size_t i = 0; i < output_size; i++) {
size_t buffer_size = aclmdlGetOutputSizeByIndex(model_desc_, i);
void* buffer_device = nullptr;
aclError ret =
aclrtMalloc(&buffer_device, buffer_size, ACL_MEM_MALLOC_NORMAL_ONLY);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] output malloc device buffer failed. size is "
<< buffer_size;
return;
}
aclDataBuffer* data_buffer =
aclCreateDataBuffer(buffer_device, buffer_size);
if (data_buffer == nullptr) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclCreateDataBuffer failed!";
aclrtFree(buffer_device);
return;
}
if (aclmdlAddDatasetBuffer(output_dataset_, data_buffer) !=
ACL_ERROR_NONE) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] output aclmdlAddDatasetBuffer failed!";
aclrtFree(buffer_device);
aclDestroyDataBuffer(data_buffer);
return;
}
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] CreateOutputDataset succeed.";
}
bool AclModelClient::ModelExecute(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor) {
// check model exists
if (model_desc_ == nullptr) {
LOG(ERROR)
<< "[HUAWEI_ASCEND_NPU] no model description, model execution failed!";
return false;
}
// create input/output dataset
CreateInputDataset(input_tensor);
CreateOutputDataset(output_tensor);
// model execution
ACL_CALL(aclmdlExecute(model_id_, input_dataset_, output_dataset_));
// get output
if (!GetTensorFromDataset(output_tensor)) {
LOG(ERROR) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset failed, modelId:"
<< model_id_;
return false;
}
VLOG(3) << "[HUAWEI_ASCEND_NPU] GetTensorFromDataset succeed, modelId:"
<< model_id_;
return true;
}
void AclModelClient::DestroyDataset(aclmdlDataset** dataset) {
if (*dataset == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] no dataset exists, no need to destroy!";
return;
}
size_t dataset_num = aclmdlGetDatasetNumBuffers(*dataset);
for (size_t i = 0; i < dataset_num; i++) {
aclDataBuffer* buffer_device = aclmdlGetDatasetBuffer(*dataset, i);
void* device_data = aclGetDataBufferAddr(buffer_device);
if (device_data == nullptr) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to get data buffer of deivce data!";
} else {
if (aclrtFree(device_data) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to free deivce data!";
}
}
if (aclDestroyDataBuffer(buffer_device) != ACL_ERROR_NONE) {
LOG(WARNING)
<< "[HUAWEI_ASCEND_NPU] failed to destroy deivce data buffer!";
}
}
if (aclmdlDestroyDataset(*dataset) != ACL_ERROR_NONE) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] failed to destroy dataset!";
}
*dataset = nullptr;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroy dataset success.";
}
bool AclModelClient::UnloadModel() {
if (!load_flag_) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] no need to unload model, load flag is "
<< load_flag_;
return true;
}
DestroyDataset(&input_dataset_);
DestroyDataset(&output_dataset_);
aclError ret = aclmdlUnload(model_id_);
if (ret != ACL_ERROR_NONE) {
LOG(ERROR) << "unload model failed, model id is " << model_id_;
return false;
}
if (model_desc_ != nullptr) {
(void)aclmdlDestroyDesc(model_desc_);
model_desc_ = nullptr;
}
if (model_memory_ptr_ != nullptr) {
aclrtFree(model_memory_ptr_);
model_memory_ptr_ = nullptr;
model_memory_size_ = 0;
}
if (model_weight_ptr_ != nullptr) {
aclrtFree(model_weight_ptr_);
model_weight_ptr_ = nullptr;
model_weight_size_ = 0;
}
load_flag_ = false;
VLOG(3) << "[HUAWEI_ASCEND_NPU] Unload model success, model id " << model_id_;
return true;
}
uint32_t AclModelClient::num_devices() {
uint32_t count = 0;
ACL_CALL(aclrtGetDeviceCount(&count));
return count;
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
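A minimal usage sketch of the client implemented above (illustrative only): it assumes the ACL runtime has already been initialized elsewhere, an offline OM model exists at the given path, the input ge::Tensor objects are already filled, and the header path matches this patch's layout.

#include <memory>
#include <vector>
#include "lite/backends/huawei_ascend_npu/model_client.h"  // assumed path

void RunOnce(std::vector<std::shared_ptr<ge::Tensor>>* inputs,
             std::vector<std::shared_ptr<ge::Tensor>>* outputs) {
  paddle::lite::huawei_ascend_npu::AclModelClient client(/*device_id=*/0);
  if (!client.LoadFromFile("model.om")) return;
  // ModelExecute copies the inputs to the device, runs aclmdlExecute and
  // copies the results back into the output tensors.
  client.ModelExecute(inputs, outputs);
  client.UnloadModel();
}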
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/huawei_ascend_npu/utils.h"
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
class TensorDesc {
public:
TensorDesc(aclDataType data_type, aclmdlIODims dims, aclFormat format) {
if (format == ACL_FORMAT_NHWC) {
dim_order[1] = 3;
dim_order[2] = 1;
dim_order[3] = 2;
}
// create ge::Tensordesc
ge_tensor_desc_ = new ge::TensorDesc(
GetGeShape(dims), GetGeFormat(format), GetGeDataType(data_type));
CHECK(ge_tensor_desc_ != nullptr);
}
  ~TensorDesc() {
    delete ge_tensor_desc_;
    ge_tensor_desc_ = nullptr;
  }
int64_t GetNumber() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[0]);
}
int64_t GetChannel() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[1]);
}
int64_t GetHeight() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[2]);
}
int64_t GetWidth() const {
return ge_tensor_desc_->GetShape().GetDim(dim_order[3]);
}
const ge::TensorDesc& GetGeTensorDesc() const { return *ge_tensor_desc_; }
private:
ge::Shape GetGeShape(aclmdlIODims dims) {
ge::Shape ge_shape({0, 0, 0, 0});
for (size_t i = 0; i < dims.dimCount; i++) {
if (ge_shape.SetDim(i, dims.dims[i]) != ge::GRAPH_SUCCESS) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] ge::Shape SetDim failed!";
} else {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Setting Ge Shape[" << i << "] = <"
<< dims.dims[i] << ">";
}
}
return ge_shape;
}
ge::Format GetGeFormat(aclFormat format) {
ge::Format ge_format = ge::FORMAT_NCHW;
switch (format) {
case ACL_FORMAT_NCHW:
ge_format = ge::FORMAT_NCHW;
break;
case ACL_FORMAT_NHWC:
ge_format = ge::FORMAT_NHWC;
break;
case ACL_FORMAT_ND:
ge_format = ge::FORMAT_ND;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] format not supported:" << format;
break;
}
return ge_format;
}
ge::DataType GetGeDataType(aclDataType data_type) {
ge::DataType ge_datatype = ge::DT_FLOAT;
switch (data_type) {
case ACL_FLOAT:
ge_datatype = ge::DT_FLOAT;
break;
case ACL_FLOAT16:
ge_datatype = ge::DT_FLOAT16;
break;
case ACL_INT8:
ge_datatype = ge::DT_INT8;
break;
case ACL_INT16:
ge_datatype = ge::DT_INT16;
break;
case ACL_INT32:
ge_datatype = ge::DT_INT32;
break;
case ACL_INT64:
ge_datatype = ge::DT_INT64;
break;
case ACL_BOOL:
ge_datatype = ge::DT_BOOL;
break;
default:
LOG(FATAL) << "[HUAWEI_ASCEND_NPU] data type not supported!";
break;
}
return ge_datatype;
}
private:
ge::TensorDesc* ge_tensor_desc_{nullptr};
// n c h w order, default to ACL_FORMAT_NCHW
std::vector<size_t> dim_order{0, 1, 2, 3};
};
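// Example (illustrative): for an NHWC io dim of {1, 224, 224, 3}, dim_order
// becomes {0, 3, 1, 2}, so GetNumber() returns 1, GetChannel() returns 3 and
// GetHeight()/GetWidth() both return 224.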
class AclModelClient {
public:
explicit AclModelClient(int device_id) {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Creating Huawei Ascend Device: "
<< device_id;
device_num_ = num_devices();
if (device_id < 0 || device_id >= device_num_) {
LOG(FATAL) << "Failed with invalid device id " << device_id;
return;
}
device_id_ = device_id;
ACL_CALL(aclrtSetDevice(device_id_));
}
~AclModelClient() {
VLOG(3) << "[HUAWEI_ASCEND_NPU] Destroying Huawei Ascend Device: "
<< device_id_;
ACL_CALL(aclrtResetDevice(device_id_));
}
bool LoadFromMem(const void* data, uint32_t size);
bool LoadFromFile(const char* model_path);
bool GetModelIOTensorDim(std::vector<TensorDesc>* input_tensor,
std::vector<TensorDesc>* output_tensor);
bool ModelExecute(std::vector<std::shared_ptr<ge::Tensor>>* input_tensor,
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool UnloadModel();
private:
void CreateInputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* input_tensor);
void CreateOutputDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
bool GetTensorFromDataset(
std::vector<std::shared_ptr<ge::Tensor>>* output_tensor);
void DestroyDataset(aclmdlDataset** dataset);
private:
uint32_t num_devices();
private:
int device_id_{0};
int device_num_{0};
aclrtContext context_{nullptr};
bool load_flag_{false};
uint32_t model_id_{0};
size_t model_memory_size_;
size_t model_weight_size_;
void* model_memory_ptr_;
void* model_weight_ptr_;
aclmdlDesc* model_desc_{nullptr};
aclmdlDataset* input_dataset_{nullptr};
aclmdlDataset* output_dataset_{nullptr};
};
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "acl/acl.h"
#include "ge/ge_api_types.h"
#include "ge/ge_ir_build.h"
#include "graph/ge_error_codes.h"
#include "graph/graph.h"
#include "graph/tensor.h"
#include "graph/types.h"
#include "lite/utils/cp_logging.h"
/*
 * This file contains some Huawei Ascend NPU specific utils.
*/
#define ACL_CALL(msg) \
CHECK_EQ(reinterpret_cast<aclError>(msg), ACL_ERROR_NONE) \
<< (msg) << " Huawei Ascend NPU ACL Error: " \
<< ::paddle::lite::huawei_ascend_npu::AclErrorInfo( \
reinterpret_cast<int>(msg))
#define ATC_CALL(msg) \
CHECK_EQ(reinterpret_cast<ge::graphStatus>(msg), ge::GRAPH_SUCCESS) \
<< (msg) << " Huawei Ascend NPU ATC Error: " \
<< ::paddle::lite::huawei_ascend_npu::AtcErrorInfo( \
reinterpret_cast<uint32_t>(msg))
namespace paddle {
namespace lite {
namespace huawei_ascend_npu {
static const char* AtcErrorInfo(uint32_t error) {
switch (error) {
#define LITE_ATC_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ATC_ERROR_INFO(ge::GRAPH_FAILED); // 0xFFFFFFFF
LITE_ATC_ERROR_INFO(ge::GRAPH_PARAM_INVALID); // 50331649
#undef LITE_ATC_ERROR_INFO
default:
return "unknown error";
break;
}
}
static const char* AclErrorInfo(int error) {
switch (error) {
#define LITE_ACL_ERROR_INFO(xx) \
case xx: \
return #xx; \
break;
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PARAM); // 100000
LITE_ACL_ERROR_INFO(ACL_ERROR_UNINITIALIZE); // 100001
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_INITIALIZE); // 100002
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE); // 100003
LITE_ACL_ERROR_INFO(ACL_ERROR_WRITE_FILE); // 100004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_FILE_SIZE); // 100005
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_FILE); // 100006
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_MISSING_ATTR); // 100007
LITE_ACL_ERROR_INFO(ACL_ERROR_FILE_ATTR_INVALID); // 100008
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DUMP_CONFIG); // 100009
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_PROFILING_CONFIG); // 100010
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_MODEL_ID); // 100011
LITE_ACL_ERROR_INFO(ACL_ERROR_DESERIALIZE_MODEL); // 100012
LITE_ACL_ERROR_INFO(ACL_ERROR_PARSE_MODEL); // 100013
LITE_ACL_ERROR_INFO(ACL_ERROR_READ_MODEL_FAILURE); // 100014
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_SIZE_INVALID); // 100015
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_MISSING_ATTR); // 100016
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_INPUT_NOT_MATCH); // 100017
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_OUTPUT_NOT_MATCH); // 100018
LITE_ACL_ERROR_INFO(ACL_ERROR_MODEL_NOT_DYNAMIC); // 100019
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_TYPE_NOT_MATCH); // 100020
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_INPUT_NOT_MATCH); // 100021
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_OUTPUT_NOT_MATCH); // 100022
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_ATTR_NOT_MATCH); // 100023
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_NOT_FOUND); // 100024
LITE_ACL_ERROR_INFO(ACL_ERROR_OP_LOAD_FAILED); // 100025
LITE_ACL_ERROR_INFO(ACL_ERROR_UNSUPPORTED_DATA_TYPE); // 100026
LITE_ACL_ERROR_INFO(ACL_ERROR_FORMAT_NOT_MATCH); // 100027
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_NOT_REGISTERED); // 100028
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_NOT_FOUND); // 100029
LITE_ACL_ERROR_INFO(ACL_ERROR_BIN_SELECTOR_ALREADY_REGISTERED); // 100030
LITE_ACL_ERROR_INFO(ACL_ERROR_KERNEL_ALREADY_REGISTERED); // 100031
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_QUEUE_ID); // 100032
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_SUBSCRIBE); // 100033
LITE_ACL_ERROR_INFO(ACL_ERROR_STREAM_NOT_SUBSCRIBE); // 100034
LITE_ACL_ERROR_INFO(ACL_ERROR_THREAD_NOT_SUBSCRIBE); // 100035
LITE_ACL_ERROR_INFO(ACL_ERROR_WAIT_CALLBACK_TIMEOUT); // 100036
LITE_ACL_ERROR_INFO(ACL_ERROR_REPEAT_FINALIZE); // 100037
LITE_ACL_ERROR_INFO(ACL_ERROR_NOT_STATIC_AIPP); // 100038
LITE_ACL_ERROR_INFO(ACL_ERROR_BAD_ALLOC); // 200000
LITE_ACL_ERROR_INFO(ACL_ERROR_API_NOT_SUPPORT); // 200001
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_DEVICE); // 200002
LITE_ACL_ERROR_INFO(ACL_ERROR_MEMORY_ADDRESS_UNALIGNED); // 200003
LITE_ACL_ERROR_INFO(ACL_ERROR_RESOURCE_NOT_MATCH); // 200004
LITE_ACL_ERROR_INFO(ACL_ERROR_INVALID_RESOURCE_HANDLE); // 200005
LITE_ACL_ERROR_INFO(ACL_ERROR_FEATURE_UNSUPPORTED); // 200006
LITE_ACL_ERROR_INFO(ACL_ERROR_STORAGE_OVER_LIMIT); // 300000
LITE_ACL_ERROR_INFO(ACL_ERROR_INTERNAL_ERROR); // 500000
LITE_ACL_ERROR_INFO(ACL_ERROR_FAILURE); // 500001
LITE_ACL_ERROR_INFO(ACL_ERROR_GE_FAILURE); // 500002
LITE_ACL_ERROR_INFO(ACL_ERROR_RT_FAILURE); // 500003
LITE_ACL_ERROR_INFO(ACL_ERROR_DRV_FAILURE); // 500004
LITE_ACL_ERROR_INFO(ACL_ERROR_PROFILING_FAILURE); // 500005
#undef LITE_ACL_ERROR_INFO
default:
return "unknown error";
break;
}
}
} // namespace huawei_ascend_npu
} // namespace lite
} // namespace paddle
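A minimal sketch of how the two macros above are used by their callers (illustrative only; any ACL call returning an aclError can be wrapped the same way):

#include "lite/backends/huawei_ascend_npu/utils.h"

void ResetDeviceOrDie(int device_id) {
  // Aborts via CHECK_EQ and prints the symbolic error name, e.g.
  // ACL_ERROR_INVALID_DEVICE, if the call does not return ACL_ERROR_NONE.
  ACL_CALL(aclrtResetDevice(device_id));
}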
......@@ -33,7 +33,7 @@ std::shared_ptr<hiai::AiModelMngerClient> Device::Load(
// Check HiAI DDK version
const char* ddk_version = model_client->GetVersion();
if (ddk_version) {
LOG(INFO) << "[NPU] HiAI DDK version: " << ddk_version;
VLOG(3) << "[NPU] HiAI DDK version: " << ddk_version;
} else {
LOG(WARNING) << "[NPU] Unable to get HiAI DDK version!";
}
......
......@@ -38,17 +38,20 @@ CLRuntime::~CLRuntime() {
}
bool CLRuntime::Init() {
if (initialized_) {
if (is_cl_runtime_initialized_) {
return true;
}
bool is_platform_init = InitializePlatform();
bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init;
initialized_ = true;
LOG(INFO) << "is_platform_init:" << is_platform_init;
LOG(INFO) << "is_device_init:" << is_device_init;
if ((is_platform_init == true) && (is_device_init == true)) {
is_platform_device_init_success_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_;
is_cl_runtime_initialized_ = true;
}
return is_cl_runtime_initialized_;
}
cl::Platform& CLRuntime::platform() {
......@@ -64,7 +67,9 @@ cl::Context& CLRuntime::context() {
}
cl::Device& CLRuntime::device() {
CHECK(device_ != nullptr) << "device_ is not initialized!";
if (device_ == nullptr) {
LOG(ERROR) << "device_ is not initialized!";
}
return *device_;
}
......@@ -150,6 +155,14 @@ GpuType CLRuntime::ParseGpuTypeFromDeviceName(std::string device_name) {
}
bool CLRuntime::InitializeDevice() {
VLOG(3) << "device_info_.size():" << device_info_.size();
for (auto i : device_info_) {
VLOG(3) << ">>> " << i.first << " " << i.second;
}
if (device_info_.size() > 0 && device_info_.size() <= 2) {
return false;
}
device_info_["PLACEHOLDER"] = 1;
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
// CL_DEVICE_NAME
......@@ -160,7 +173,7 @@ bool CLRuntime::InitializeDevice() {
status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
CL_CHECK_ERROR(status_);
if (all_devices.empty()) {
LOG(FATAL) << "No OpenCL GPU device found!";
LOG(ERROR) << "No available OpenCL GPU device found!";
return false;
}
device_ = std::make_shared<cl::Device>();
......@@ -313,9 +326,6 @@ bool CLRuntime::InitializeDevice() {
}
std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
if (0 != device_info_.size()) {
return device_info_;
}
InitializeDevice();
return device_info_;
}
......
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/backends/opencl/cl_wrapper.h"
typedef enum {
UNKNOWN = 0,
......@@ -68,6 +69,28 @@ class CLRuntime {
public:
static CLRuntime* Global();
bool OpenCLAvaliableForDevice() {
bool opencl_lib_found = paddle::lite::CLWrapper::Global()->OpenclLibFound();
LOG(INFO) << "opencl_lib_found:" << opencl_lib_found;
if (opencl_lib_found == false) return false;
bool dlsym_success = paddle::lite::CLWrapper::Global()->DlsymSuccess();
LOG(INFO) << "dlsym_success:" << dlsym_success;
    if (dlsym_success == false) return false;
InitializeDevice();
bool support_fp16 =
static_cast<bool>(device_info_["CL_DEVICE_EXTENSIONS_FP16"]);
LOG(INFO) << "support_fp16:" << support_fp16;
if (support_fp16 == false) return false;
is_device_avaliable_for_opencl_ =
dlsym_success && opencl_lib_found && support_fp16;
LOG(INFO) << "is_device_avaliable_for_opencl_:"
<< is_device_avaliable_for_opencl_;
return is_device_avaliable_for_opencl_;
}
bool Init();
cl::Platform& platform();
......@@ -85,7 +108,7 @@ class CLRuntime {
bool BuildProgram(cl::Program* program, const std::string& options = "");
bool IsInitSuccess() { return is_init_success_; }
bool IsInitSuccess() { return is_platform_device_init_success_; }
std::string cl_path() { return cl_path_; }
......@@ -167,9 +190,11 @@ class CLRuntime {
cl_int status_{CL_SUCCESS};
bool initialized_{false};
bool is_device_avaliable_for_opencl_{false};
bool is_cl_runtime_initialized_{false};
bool is_init_success_{false};
bool is_platform_device_init_success_{false};
};
} // namespace lite
......
......@@ -19,14 +19,16 @@ limitations under the License. */
namespace paddle {
namespace lite {
CLWrapper *CLWrapper::Global() {
static CLWrapper wrapper;
return &wrapper;
}
CLWrapper::CLWrapper() {
CHECK(InitHandle()) << "Fail to initialize the OpenCL library!";
InitFunctions();
opencl_lib_found_ = InitHandle();
CHECK(opencl_lib_found_) << "Fail to initialize the OpenCL library!";
dlsym_success_ = InitFunctions();
}
bool CLWrapper::InitHandle() {
......@@ -68,15 +70,17 @@ bool CLWrapper::InitHandle() {
}
}
void CLWrapper::InitFunctions() {
bool CLWrapper::InitFunctions() {
CHECK(handle_ != nullptr) << "The library handle can't be null!";
bool dlsym_success = true;
#define PADDLE_DLSYM(cl_func) \
do { \
cl_func##_ = (cl_func##Type)dlsym(handle_, #cl_func); \
if (cl_func##_ == nullptr) { \
LOG(FATAL) << "Cannot find the " << #cl_func \
LOG(ERROR) << "Cannot find the " << #cl_func \
<< " symbol in libOpenCL.so!"; \
dlsym_success = false; \
break; \
} \
VLOG(4) << "Loaded the " << #cl_func << " symbol successfully."; \
......@@ -137,6 +141,7 @@ void CLWrapper::InitFunctions() {
PADDLE_DLSYM(clEnqueueCopyImage);
#undef PADDLE_DLSYM
return dlsym_success;
}
} // namespace lite
......
......@@ -508,13 +508,20 @@ class CLWrapper final {
return clEnqueueCopyImage_;
}
bool OpenclLibFound() { return opencl_lib_found_; }
bool DlsymSuccess() { return dlsym_success_; }
private:
CLWrapper();
CLWrapper(const CLWrapper &) = delete;
CLWrapper &operator=(const CLWrapper &) = delete;
bool InitHandle();
void InitFunctions();
bool InitFunctions();
bool opencl_lib_found_{true};
bool dlsym_success_{true};
void *handle_{nullptr};
clGetPlatformIDsType clGetPlatformIDs_{nullptr};
clGetPlatformInfoType clGetPlatformInfo_{nullptr};
clBuildProgramType clBuildProgram_{nullptr};
......
......@@ -19,7 +19,7 @@
#include <memory>
#include <string>
#include <type_traits>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/backends/xpu/target_wrapper.h"
namespace paddle {
namespace lite {
......@@ -82,8 +82,8 @@ void DumpXPUMem(const T* ptr,
size_t item_per_line = 30) {
size_t after_stride_len = (len + stride - 1) / stride;
std::unique_ptr<T[]> cpu_mem(new T[len]);
xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(
cpu_mem.get(), ptr, len * sizeof(T), XPUMemcpyKind::XPU_DEVICE_TO_HOST));
std::unique_ptr<T[]> after_stride(new T[after_stride_len]);
for (size_t i = 0; i < after_stride_len; ++i) {
after_stride[i] = cpu_mem[i * stride];
......
......@@ -19,11 +19,11 @@ namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
XPU_CALL(xpu_malloc(&ptr, size));
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::Free(void* ptr) { XPU_CALL(xpu_free(ptr)); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
......@@ -31,10 +31,10 @@ void TargetWrapperXPU::MemcpySync(void* dst,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE));
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
XPU_CALL(xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST));
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
......@@ -49,7 +49,7 @@ XPUScratchPadGuard TargetWrapperXPU::MallocScratchPad(size_t size,
} else {
ptr = TargetWrapperXPU::Malloc(size);
}
CHECK(ptr != nullptr);
CHECK(ptr != nullptr) << "size = " << size << ", use_l3 = " << use_l3;
return XPUScratchPadGuard(new XPUScratchPad(ptr, use_l3));
}
......
......@@ -16,11 +16,23 @@
#include <memory> // std::unique_ptr
#include "lite/backends/xpu/xpu_header_sitter.h" // xpu_free
#include "lite/core/target_wrapper.h"
#include "lite/core/target_wrapper.h" // TargetWrapper
#include "lite/utils/cp_logging.h" // CHECK_EQ
#define XPU_CALL(func) \
{ \
auto e = (func); \
CHECK_EQ(e, 0) << "XPU: (" << #func << ") returns " << e; \
}
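// Usage (illustrative): raw XPU runtime calls in this backend are wrapped as
//   XPU_CALL(xpu_set_device(0));
// which aborts via CHECK_EQ if the wrapped call returns a non-zero status.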
namespace paddle {
namespace lite {
// MAX(lod.size()) = 64
const int XPU_MAX_LOD_SIZE = 64;
// MAX(lod[i + 1] - lod[i]) = 512
const int XPU_MAX_LOD_SEQ_LEN = 512;
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
struct XPUScratchPad {
......@@ -33,7 +45,7 @@ struct XPUScratchPad {
struct XPUScratchPadDeleter {
void operator()(XPUScratchPad* sp) const {
if (!sp->is_l3_) {
xpu_free(sp->addr_);
XPU_CALL(xpu_free(sp->addr_));
}
delete sp;
}
......@@ -55,7 +67,7 @@ class TargetWrapper<TARGET(kXPU)> {
size_t size,
IoDirection dir);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = true);
static XPUScratchPadGuard MallocScratchPad(size_t size, bool use_l3 = false);
static xdnn::Context* GetRawContext() {
if (tls_raw_ctx_ == nullptr) {
......@@ -77,11 +89,10 @@ class TargetWrapper<TARGET(kXPU)> {
static void SetDev(int dev_no = 0) {
const char* dev_env = getenv("LITE_XPU_DEV");
if (dev_env) {
xpu_set_device(atoi(dev_env));
return;
dev_no = atoi(dev_env);
}
xpu_set_device(dev_no);
XPU_CALL(xpu_set_device(dev_no));
}
static std::string multi_encoder_precision; // NOLINT
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${huawei_ascend_npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -24,7 +24,7 @@ namespace arena {
void TestCase::CreateInstruction() {
std::shared_ptr<lite::OpLite> op = nullptr;
static const std::set<TargetType> subgraph_op_supported_targets(
{TARGET(kNPU), TARGET(kXPU)});
{TARGET(kNPU), TARGET(kXPU), TARGET(kHuaweiAscendNPU)});
bool enable_subgraph_op = subgraph_op_supported_targets.find(place_.target) !=
subgraph_op_supported_targets.end();
#if defined(LITE_WITH_XPU) && !defined(LITE_WITH_XTCL)
......@@ -32,25 +32,35 @@ void TestCase::CreateInstruction() {
#endif
if (enable_subgraph_op) {
// Create a new block desc to wrap the original op desc
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
int sub_block_idx = 0;
auto sub_block_desc = new cpp::BlockDesc();
auto sub_block_desc = sub_program_desc->AddBlock<cpp::BlockDesc>();
sub_block_desc->ClearOps();
sub_block_desc->ClearVars();
auto sub_block_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_block_op_desc = *op_desc_;
auto sub_op_desc = sub_block_desc->AddOp<cpp::OpDesc>();
*sub_op_desc = *op_desc_;
// Add the block desc into the subgraph op which used to replace the
// original op
op_desc_.reset(new cpp::OpDesc());
op_desc_->SetType("subgraph");
op_desc_->SetAttr<int32_t>("sub_block", sub_block_idx);
auto in_names = sub_block_op_desc->input_vars();
auto out_names = sub_block_op_desc->output_vars();
auto in_names = sub_op_desc->input_vars();
auto out_names = sub_op_desc->output_vars();
op_desc_->SetInput("Inputs", in_names);
op_desc_->SetOutput("Outputs", out_names);
op_desc_->SetAttr<std::vector<std::string>>("input_data_names", in_names);
    // filter only data vars (exclude const vars marked persistable)
std::vector<std::string> in_data_names;
for (auto name : in_names) {
if (!(inst_scope_->FindTensor(name)->persistable())) {
in_data_names.push_back(name);
}
}
op_desc_->SetAttr<std::vector<std::string>>("input_data_names",
in_data_names);
op_desc_->SetAttr<std::vector<std::string>>("output_data_names", out_names);
op = LiteOpRegistry::Global().Create(op_desc().Type());
static_cast<operators::SubgraphOp*>(op.get())->SetSubBlock(sub_block_desc);
static_cast<operators::SubgraphOp*>(op.get())->SetProgramDesc(
sub_program_desc);
} else {
op = LiteOpRegistry::Global().Create(op_desc().Type());
}
......@@ -60,7 +70,7 @@ void TestCase::CreateInstruction() {
// filter out the target kernel
CHECK(!kernels.empty()) << "No kernel found for place "
<< place_.DebugString();
auto it = std::remove_if(
auto it = std::find_if(
kernels.begin(), kernels.end(), [&](std::unique_ptr<KernelBase>& k) {
return k->alias() == alias_;
});
......@@ -234,19 +244,6 @@ bool TestCase::CheckPrecision(const std::string& var_name,
return success;
}
TestCase::~TestCase() {
if (op_desc_->Type() == "subgraph") {
// Release the subblock desc of Subgraph op
auto subgraph_op = const_cast<operators::SubgraphOp*>(
static_cast<const operators::SubgraphOp*>(instruction_->op()));
CHECK(subgraph_op);
auto sub_block_desc = subgraph_op->GetSubBlock();
if (sub_block_desc) {
delete sub_block_desc;
}
}
}
} // namespace arena
} // namespace lite
} // namespace paddle
......@@ -46,7 +46,7 @@ class TestCase {
base_scope_(new Scope) {
ctx_ = ContextScheduler::Global().NewContext(place_.target);
}
virtual ~TestCase();
virtual ~TestCase() {}
void Prepare() {
PrepareData();
......
......@@ -17,8 +17,13 @@
namespace paddle {
namespace lite {
#ifdef LITE_WITH_NPU
std::string Context<TargetType::kNPU>::subgraph_model_cache_dir_{""}; // NOLINT
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
thread_local std::string
Context<TargetType::kHuaweiAscendNPU>::subgraph_model_cache_dir_{
""}; // NOLINT
thread_local int
Context<TargetType::kHuaweiAscendNPU>::huawei_ascend_device_id_{
0}; // NOLINT
#endif
#ifdef LITE_WITH_MLU
......
......@@ -39,6 +39,7 @@
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/scope.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
#include "lite/utils/all.h"
......@@ -61,6 +62,7 @@ using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
using HuaweiAscendNPUContext = Context<TargetType::kHuaweiAscendNPU>;
template <>
class Context<TargetType::kHost> {
......@@ -84,6 +86,35 @@ class Context<TargetType::kNPU> {
NPUContext& operator=(const NPUContext& ctx) {}
std::string name() const { return "NPUContext"; }
static void SetSubgraphModelCacheDir(Scope* scope,
std::string subgraph_model_cache_dir) {
auto var = scope->Var("SUBGRAPH_MODEL_CACHE_DIR");
CHECK(var);
auto data = var->GetMutable<std::string>();
CHECK(data);
*data = subgraph_model_cache_dir;
}
static std::string SubgraphModelCacheDir(Scope* scope) {
auto var = scope->FindVar("SUBGRAPH_MODEL_CACHE_DIR");
if (!var) return "";
return var->Get<std::string>();
}
};
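// Usage sketch (illustrative): the cache dir is now carried per execution
// scope instead of a process-wide static, e.g.
//   Context<TargetType::kNPU>::SetSubgraphModelCacheDir(
//       exec_scope, "/data/local/tmp/npu_model_cache");
//   auto dir = Context<TargetType::kNPU>::SubgraphModelCacheDir(exec_scope);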
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
template <>
class Context<TargetType::kHuaweiAscendNPU> {
public:
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(HuaweiAscendNPUContext* ctx) {}
HuaweiAscendNPUContext& operator=(const HuaweiAscendNPUContext& ctx) {
return *this;
}
std::string name() const { return "HuaweiAscendNPUContext"; }
static void SetSubgraphModelCacheDir(std::string subgraph_model_cache_dir) {
subgraph_model_cache_dir_ = subgraph_model_cache_dir;
}
......@@ -91,8 +122,14 @@ class Context<TargetType::kNPU> {
return subgraph_model_cache_dir_;
}
static void SetHuaweiAscendDeviceID(int huawei_ascend_device_id) {
huawei_ascend_device_id_ = huawei_ascend_device_id;
}
static int HuaweiAscendDeviceID() { return huawei_ascend_device_id_; }
private:
static std::string subgraph_model_cache_dir_;
static thread_local std::string subgraph_model_cache_dir_;
static thread_local int huawei_ascend_device_id_;
};
#endif
......@@ -385,6 +422,13 @@ class ContextScheduler {
&ctx->As<NPUContext>());
break;
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
case TARGET(kHuaweiAscendNPU):
kernel_contexts_[TargetType::kHuaweiAscendNPU]
.As<HuaweiAscendNPUContext>()
.CopySharedTo(&ctx->As<HuaweiAscendNPUContext>());
break;
#endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
......@@ -466,6 +510,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>();
#endif
#ifdef LITE_WITH_HUAWEI_ASCEND_NPU
InitContext<TargetType::kHuaweiAscendNPU, HuaweiAscendNPUContext>();
#endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
......
......@@ -18,6 +18,7 @@ lite_cc_library(mir_passes
fusion/conv_activation_fuse_pass.cc
fusion/var_conv_2d_activation_fuse_pass.cc
fusion/conv_bn_fuse_pass.cc
fusion/conv_conv_fuse_pass.cc
fusion/elementwise_add_activation_fuse_pass.cc
fusion/quant_dequant_fuse_pass.cc
fusion/sequence_pool_concat_fuse_pass.cc
......@@ -32,6 +33,7 @@ lite_cc_library(mir_passes
elimination/identity_dropout_eliminate_pass.cc
elimination/elementwise_mul_constant_eliminate_pass.cc
elimination/remove_tf_redundant_ops_pass.cc
elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.cc
static_kernel_pick_pass.cc
variable_place_inference_pass.cc
type_target_cast_pass.cc
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/elimination/control_flow_op_unused_inputs_and_outputs_eliminate_pass.h"
#include <algorithm>
#include <list>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
// Remove all of the unused nodes from the control flow op and update the
// inputs and outputs of the op info. The unused nodes are defined as the nodes
// which are only linked to the control flow op nodes but never linked to the
// other op nodes.
//
// For example:
// graph[0]: main block
// in_x
// in_f | in_z(unused node)
// \ | /
// \ | /
// in_w ------- while ------- in_y(unused_node)
// / |
// / |
// (unused node)out_y |
// out_x
//
// graph[1]: sub block
// in_x
// |
// |
// conv2d----in_f
// |
// |
// fc ------in_w
// |
// |
// softmax
// |
// |
// out_x
//
// After the pass is applied:
// in_x
// in_f |
// \ |
// \ |
// in_w ------- while
// |
// |
// |
// out_x
// Remove the var node from var2rm if it is recursively referred to by any op
// in the subblock
void CollectUnusedInputOutputNodes(
int block_idx,
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs,
const std::unordered_set<std::string>& control_flow_op_types,
std::unordered_map<std::string, Node*>* in_vars2rm,
std::unordered_map<std::string, Node*>* out_vars2rm) {
auto block_size = graphs->size();
for (auto& op_node : (*graphs)[block_idx]->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().op_info();
auto op_type = op_info->Type();
if (control_flow_op_types.count(op_type)) {
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
      CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
CollectUnusedInputOutputNodes(sub_block_idx,
graphs,
control_flow_op_types,
in_vars2rm,
out_vars2rm);
} else {
for (auto& var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
}
for (auto& var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
        // Tensor array may only be used as the output vars in the subblock
if (in_vars2rm->count(var_name)) {
in_vars2rm->erase(var_name);
}
if (out_vars2rm->count(var_name)) {
out_vars2rm->erase(var_name);
}
}
}
}
}
// Remove the unused var nodes from the graph and update the op_info of the
// control flow op
void RemoveNodesFromGraphAndUpdateOpInfo(
SSAGraph* graph,
Node* op_node,
const std::unordered_map<std::string, Node*>& in_vars2rm,
const std::unordered_map<std::string, Node*>& out_vars2rm) {
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
// Unlink the in_vars2rm and out_vars2rm from the control flow op node, and
  // remove them if never used.
for (auto& var_node : in_vars2rm) {
VLOG(3) << "in var node '" << var_node.first << "' is unlinked to "
<< op_type;
RemoveDirectedLink(var_node.second, op_node);
}
for (auto& var_node : out_vars2rm) {
VLOG(3) << "out var node '" << var_node.first << "' is unlinked from "
<< op_type;
RemoveDirectedLink(op_node, var_node.second);
// Unlink from all of the out op nodes.
std::unordered_set<Node*> out_op_nodes;
for (auto* out_op_node : var_node.second->outlinks) {
if (!out_op_nodes.count(out_op_node)) {
out_op_nodes.insert(out_op_node);
}
}
for (auto* out_op_node : out_op_nodes) {
RemoveDirectedLink(var_node.second, out_op_node);
}
}
// Remove the unused nodes from the graph if their inlinks and outlinks are
// empty
std::unordered_set<const Node*> removed_var_nodes;
for (auto& var_node : in_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "in var node " << var_node.first << " is removed";
}
}
for (auto& var_node : out_vars2rm) {
if (var_node.second->inlinks.empty() && var_node.second->outlinks.empty() &&
!removed_var_nodes.count(var_node.second)) {
removed_var_nodes.insert(var_node.second);
graph->RemoveNode(var_node.second);
VLOG(3) << "out var node " << var_node.first << " is removed";
}
}
// Update the op info of the control flow op
for (auto& input : *op_info->mutable_inputs()) {
for (auto var = input.second.begin(); var != input.second.end();) {
if (in_vars2rm.count(*var)) {
var = input.second.erase(var);
} else {
++var;
}
}
}
for (auto& output : *op_info->mutable_outputs()) {
for (auto var = output.second.begin(); var != output.second.end();) {
if (out_vars2rm.count(*var)) {
var = output.second.erase(var);
} else {
++var;
}
}
}
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::SetAllGraphs(
std::vector<std::unique_ptr<mir::SSAGraph>>* graphs) {
CHECK(graphs && !graphs->empty());
graphs_ = graphs;
}
void ControlFlowOpUnusedInputsAndOutputsEliminatePass::Apply(
const std::unique_ptr<SSAGraph>& graph) {
// Remove the unused input and output nodes from the control flow op nodes
  // which are only linked to the control flow op nodes but never linked to the
  // other op nodes
const std::unordered_set<std::string> control_flow_op_types = {
"while", "conditional_block"};
auto block_size = graphs_->size();
for (auto& op_node : graph->StmtTopologicalOrder()) {
if (!op_node->IsStmt()) continue;
auto op_info = op_node->AsStmt().mutable_op_info();
auto op_type = op_info->Type();
if (!control_flow_op_types.count(op_type)) continue;
int sub_block_idx = op_info->GetAttr<int32_t>("sub_block");
CHECK(sub_block_idx >= 0 && sub_block_idx < block_size);
// Initialize the unused nodes with all of the input and output nodes
std::unordered_map<std::string, Node *> in_vars2rm, out_vars2rm;
for (auto* var_node : op_node->inlinks) {
auto& var_name = var_node->AsArg().name;
if (!in_vars2rm.count(var_name)) {
in_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
for (auto* var_node : op_node->outlinks) {
auto& var_name = var_node->AsArg().name;
if (!out_vars2rm.count(var_name)) {
out_vars2rm.insert(std::pair<std::string, Node*>(var_name, var_node));
}
}
    // Remove the nodes which are used in the subblock recursively, and the
    // remaining nodes are the unused ones.
CollectUnusedInputOutputNodes(sub_block_idx,
graphs_,
control_flow_op_types,
&in_vars2rm,
&out_vars2rm);
if (in_vars2rm.size() > 0 || out_vars2rm.size() > 0) {
// Remove the unused nodes from graph, and update the op info of the
// control flow op
RemoveNodesFromGraphAndUpdateOpInfo(
graph.get(), op_node, in_vars2rm, out_vars2rm);
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(
control_flow_op_unused_inputs_and_outputs_eliminate_pass,
paddle::lite::mir::ControlFlowOpUnusedInputsAndOutputsEliminatePass)
.BindTargets({TARGET(kNPU)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <limits>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "lite/core/mir/pass.h"
#include "lite/core/types.h"
namespace paddle {
namespace lite {
namespace mir {
class ControlFlowOpUnusedInputsAndOutputsEliminatePass : public mir::StmtPass {
public:
void Apply(const std::unique_ptr<SSAGraph> &graph) override;
void SetAllGraphs(std::vector<std::unique_ptr<mir::SSAGraph>> *graphs);
private:
std::vector<std::unique_ptr<mir::SSAGraph>> *graphs_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
......@@ -16,6 +16,9 @@ lite_cc_library(fuse_var_conv_activation
lite_cc_library(fuse_conv_bn
SRCS conv_bn_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_conv_conv
SRCS conv_conv_fuser.cc
DEPS pattern_matcher_high_api)
lite_cc_library(fuse_elementwise_add_activation
SRCS elementwise_add_activation_fuser.cc
DEPS pattern_matcher_high_api)
......@@ -42,6 +45,7 @@ set(mir_fusers
fuse_conv_activation
fuse_var_conv_activation
fuse_conv_bn
fuse_conv_conv
fuse_quant_dequant
fuse_elementwise_add_activation
fuse_transpose_softmax_transpose
......
......@@ -383,10 +383,10 @@ class XPUSingleEncoderFuser : public FuseBase {
op_desc.SetAttr<std::string>("act_type", act_type_);
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
auto* single_encoder_stmt = matched.at("q_mul")->stmt();
fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places());
......
......@@ -373,10 +373,10 @@ class XPUResNetCbamBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -693,10 +693,10 @@ class XPUResNetCbamBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......@@ -932,10 +932,10 @@ class XPUResNetCbamBlock2Fuser : public FuseBase {
<< "Y of last fc must have been transposed";
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, scope);
fake_subgraph_op->SetValidPlaces(block2_stmt->op()->valid_places());
block2_stmt->SetOp(fake_subgraph_op);
......
......@@ -315,10 +315,10 @@ class XPUResNetBlock0Fuser : public FuseBase {
auto block0_stmt = matched.at("left_conv1")->stmt();
// block0_stmt->ResetOp(op_desc, graph->valid_places());
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places());
block0_stmt->SetOp(fake_subgraph_op);
......@@ -577,10 +577,10 @@ class XPUResNetBlock1Fuser : public FuseBase {
auto block1_stmt = matched.at("right_conv1")->stmt();
auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph");
// XXX: memleak?
auto sub_block_desc = new cpp::BlockDesc();
auto sub_program_desc = std::make_shared<cpp::ProgramDesc>();
sub_program_desc->AddBlock<cpp::BlockDesc>();
static_cast<operators::SubgraphOp*>(fake_subgraph_op.get())
->SetSubBlock(sub_block_desc);
->SetProgramDesc(sub_program_desc);
fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope());
fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places());
block1_stmt->SetOp(fake_subgraph_op);
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuse_pass.h"
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
namespace lite {
namespace mir {
void ConvConvFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
  // initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
bool has_arm = false;
for (auto& place : graph->valid_places()) {
if (place.target == TARGET(kARM) && place.precision == PRECISION(kFloat)) {
has_arm = true;
break;
}
}
if (!has_arm) {
return;
}
// only support fp32 fusion
for (auto conv_has_bias0 : conv_has_bias_cases) {
for (auto conv_has_bias1 : conv_has_bias_cases) {
for (auto conv_type0 : conv_type_cases) {
for (auto conv_type1 : conv_type_cases) {
VLOG(4) << "conv_has_bias0:" << conv_has_bias0
<< " conv_type0:" << conv_type0;
VLOG(4) << "conv_has_bias1:" << conv_has_bias1
<< " conv_type1:" << conv_type1;
fusion::ConvConvFuser fuser(
conv_type0, conv_type1, conv_has_bias0, conv_has_bias1);
fuser(graph.get());
}
}
}
}
}
} // namespace mir
} // namespace lite
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_conv_fuse_pass, paddle::lite::mir::ConvConvFusePass)
.BindTargets({TARGET(kARM)});
......@@ -14,18 +14,19 @@
#pragma once
#include "lite/backends/xpu/xpu_header_sitter.h"
#include <memory>
#include <string>
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {
namespace mir {
struct XPUFreeDeleter {
void operator()(void* p) const { xpu_free(p); }
class ConvConvFusePass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
} // namespace xpu
} // namespace kernels
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/fusion/conv_conv_fuser.h"
#include <memory>
#include <set>
#include <vector>
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
void ConvConvFuser::BuildPattern() {
auto* conv_input0 = VarNode("conv_input0")
->assert_is_op_input(conv_type0_, "Input")
->AsInput();
auto* conv_weight0 = VarNode("conv_weight0")
->assert_is_op_input(conv_type0_, "Filter")
->AsInput();
auto* conv0 = OpNode("conv2d0", conv_type0_)->assert_is_op(conv_type0_);
auto* conv_out0 = VarNode("conv_out0")
->assert_is_op_output(conv_type0_, "Output")
->assert_is_op_input(conv_type1_, "Input")
->AsIntermediate();
auto* conv_weight1 = VarNode("conv_weight1")
->assert_is_op_input(conv_type1_, "Filter")
->AsIntermediate();
auto* conv1 = OpNode("conv2d1", conv_type1_)
->assert_is_op(conv_type1_)
->assert_op_attr<int>("groups", 1)
->AsIntermediate();
auto* conv_out1 = VarNode("conv_out1")
->assert_is_op_output(conv_type1_, "Output")
->AsOutput();
if (conv_has_bias0_) {
if (conv_has_bias1_) {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
auto* conv_bias0 = VarNode("conv_bias0")
->assert_is_op_input(conv_type0_, "Bias")
->AsIntermediate();
conv0->LinksFrom({conv_input0, conv_weight0, conv_bias0})
.LinksTo({conv_out0});
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
} else {
conv0->LinksFrom({conv_input0, conv_weight0}).LinksTo({conv_out0});
if (conv_has_bias1_) {
auto* conv_bias1 = VarNode("conv_bias1")
->assert_is_op_input(conv_type1_, "Bias")
->AsInput();
conv1->LinksFrom({conv_out0, conv_weight1, conv_bias1})
.LinksTo({conv_out1});
} else {
conv1->LinksFrom({conv_out0, conv_weight1}).LinksTo({conv_out1});
}
}
}
void ConvConvFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto conv_instruct = matched.at("conv2d0")->stmt();
auto conv_op_desc = conv_instruct->mutable_op_info();
auto conv = conv_instruct->op();
auto* scope = conv->scope();
auto conv_op_desc1 = matched.at("conv2d1")->stmt()->mutable_op_info();
// conv0
auto weight0_t = scope->FindVar(matched.at("conv_weight0")->arg()->name)
->GetMutable<lite::Tensor>();
// conv1
auto weight1_t = scope->FindVar(matched.at("conv_weight1")->arg()->name)
->GetMutable<lite::Tensor>();
// auto groups0 = conv_op_desc->GetAttr<int>("groups");
auto groups1 = conv_op_desc1->GetAttr<int>("groups");
auto strides1 = conv_op_desc1->GetAttr<std::vector<int>>("strides");
auto paddings1 = conv_op_desc1->GetAttr<std::vector<int>>("paddings");
auto dilations1 = conv_op_desc1->GetAttr<std::vector<int>>("dilations");
bool enable0_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool enable1_int8 = conv_op_desc1->HasAttr("enable_int8") ? true : false;
int kw = weight1_t->dims()[2];
int kh = weight1_t->dims()[3];
if (!(kw == 1 && kh == 1)) {
return;
}
CHECK_EQ(enable0_int8, enable1_int8) << "The Conv compute type must be same";
CHECK_EQ(groups1, 1) << "The groups of weight1_dim must be 1";
CHECK_EQ(weight0_t->dims()[0], weight1_t->dims()[1])
<< "weight0_dims[0] == weight1_dim[1]";
for (int i = 0; i < strides1.size(); i++) {
CHECK_EQ(strides1[i], 1) << "strides[" << i << "]: " << strides1[i]
<< " must be 1";
}
for (int i = 0; i < paddings1.size(); i++) {
CHECK_EQ(paddings1[i], 0) << "paddings1[" << i << "]: " << paddings1[i]
<< " must be 0";
}
for (int i = 0; i < dilations1.size(); i++) {
CHECK_EQ(dilations1[i], 1) << "dilations1[" << i << "]: " << dilations1[i]
<< " must be 1";
}
  // compute new_weight and new_bias
///////////////////////////////////////////////////////////////////////////////
// Compute ConvConvFuser
// Before fusion
//
  //   conv0(x) = kx + z = y
  //   conv1(y) = ay + b
  //
  // After fusion:
  //
  //   conv1(conv0(x)) = a(kx + z) + b = akx + az + b
//
// new_weights = ak
// new_bias = az + b
///////////////////////////////////////////////////////////////////////////////
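  // Worked example (illustrative): with scalar "convs" k = 2, z = 3 for the
  // first conv and a = 4, b = 5 for the 1x1 conv, applying them in sequence
  // to x = 1 gives 4 * (2 * 1 + 3) + 5 = 25, while the fused conv with
  // new_weights = ak = 8 and new_bias = az + b = 17 gives 8 * 1 + 17 = 25.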
if (enable0_int8) {
LOG(FATAL) << "it doesn't support";
return;
} else {
// compute new conv_weight
Tensor weight_tensor;
auto in_dims = weight0_t->dims();
auto weight_dims = weight1_t->dims();
const float* din = weight0_t->data<float>();
const float* weights = weight1_t->data<float>();
int oc0 = in_dims[0];
int ic = in_dims[1];
int ih = in_dims[2];
int iw = in_dims[3];
int oc = weight_dims[0];
weight_tensor.Resize({oc, ic, ih, iw});
float* dout = weight_tensor.mutable_data<float>();
ComputeNewWeight(dout, din, weights, oc0, ic, ih, iw, oc);
weight0_t->CopyDataFrom(weight_tensor);
}
// compute new conv_bias
if (conv_has_bias0_ && conv_op_desc->HasInput("Bias") &&
conv_op_desc->Input("Bias").size() > 0) {
auto bias_t0 = scope->FindVar(matched.at("conv_bias0")->arg()->name)
->GetMutable<lite::Tensor>();
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
auto bias_t1 = scope->FindVar(matched.at("conv_bias1")->arg()->name)
->GetMutable<lite::Tensor>();
Tensor bias;
bias.CopyDataFrom(*bias_t1);
auto bias_data = bias.mutable_data<float>();
ComputeNewBias(bias_data, bias_t0, weight1_t, bias_t1);
bias_t1->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
} else {
Tensor bias;
auto weight_dims = weight1_t->dims();
bias.Resize({weight_dims[0]});
auto bias_d = bias.mutable_data<float>();
ComputeNewBias(bias_d, bias_t0, weight1_t, nullptr);
bias_t0->CopyDataFrom(bias);
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias0")->arg()->name}); // conv_bias
}
} else {
if (conv_has_bias1_ && conv_op_desc1->HasInput("Bias") &&
conv_op_desc1->Input("Bias").size() > 0) {
conv_op_desc->SetInput(
"Bias", {matched.at("conv_bias1")->arg()->name}); // conv_bias
IR_NODE_LINK_TO(matched.at("conv_bias1"), matched.at("conv2d0"));
}
}
conv_op_desc->SetType(conv_type0_);
conv_op_desc->SetInput("Input", {matched.at("conv_input0")->arg()->name});
conv_op_desc->SetInput("Filter", {matched.at("conv_weight0")->arg()->name});
conv_op_desc->SetOutput("Output", {matched.at("conv_out1")->arg()->name});
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
IR_OP_VAR_LINK(matched.at("conv2d0"), matched.at("conv_out1"));
}
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cmath>
#include <memory>
#include <string>
#include "lite/core/mir/pattern_matcher_high_api.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace mir {
namespace fusion {
class ConvConvFuser : public FuseBase {
public:
explicit ConvConvFuser(const std::string& conv_type0,
const std::string& conv_type1,
const bool conv_has_bias0,
const bool conv_has_bias1)
: conv_type0_(conv_type0),
conv_type1_(conv_type1),
conv_has_bias0_(conv_has_bias0),
conv_has_bias1_(conv_has_bias1) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
void ComputeNewWeight(float* dout,
const float* din,
const float* weights,
int oc0,
int ic,
int ih,
int iw,
int oc1) {
// input conv_weight0_t weights conv_weight1_t
// output weight_tensor
// ksize = 1
int in_size = ih * iw;
int in_channel_size = ic * in_size;
// out = w1[j, i, ih, iw] * w2[k, j, kw, kh]
// out_dim = [oc1, ic, kh, kw], din_dim = [oc0, ic, kh, kw]
// weight_dim = [oc1, oc0, kh, kw]
for (int k = 0; k < oc1; k++) {
const float* weights_ptr = weights + k * oc0;
float* out_ptr = dout + k * in_channel_size;
for (int c = 0; c < ic; c++) {
float* out_ptr_channel = out_ptr + c * in_size;
const float* din_ptr = din + c * in_size;
for (int i = 0; i < in_size; i++) {
float sum = 0.f;
for (int j = 0; j < oc0; j++) {
sum += din_ptr[j * in_channel_size] * weights_ptr[j];
}
*out_ptr_channel++ = sum;
}
}
}
}
void ComputeNewBias(float* dout,
Tensor* bias0_tensor,
Tensor* weight_tensor,
Tensor* bias1_tensor) {
// input bias0_tensor weight_tensor bias1_tensor
// output bias_tensor
auto in_dims = bias0_tensor->dims();
auto weight_dims = weight_tensor->dims();
const float* din = bias0_tensor->data<float>();
const float* weights = weight_tensor->data<float>();
int ic = in_dims[0];
int oc = weight_dims[0];
// out_k = b0[num, j, 1, 1] * w2[k, j, 1, 1]
if (bias1_tensor) {
const float* din2 = bias1_tensor->data<float>();
for (int k = 0; k < oc; k++) {
const float* weights_ptr = weights + k * ic;
float sum = 0.f;
for (int j = 0; j < ic; j++) {
sum += din[j] * weights_ptr[j];
}
dout[k] = sum + din2[k];
}
} else {
for (int k = 0; k < oc; k++) {
const float* weights_ptr = weights + k * ic;
float sum = 0.f;
for (int j = 0; j < ic; j++) {
sum += din[j] * weights_ptr[j];
}
dout[k] = sum;
}
}
}
private:
std::string conv_type0_{"conv2d"};
std::string conv_type1_{"conv2d"};
bool conv_has_bias0_{false};
bool conv_has_bias1_{false};
};
} // namespace fusion
} // namespace mir
} // namespace lite
} // namespace paddle
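A standalone sketch (not part of the patch) that checks the folding arithmetic implemented by ComputeNewWeight/ComputeNewBias in the degenerate case where both convolutions are 1x1, so each conv reduces to a matrix multiply: y = W1 * (W0 * x + b0) + b1 must equal (W1 * W0) * x + (W1 * b0 + b1). All sizes and values are illustrative assumptions.

#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int ic = 2, oc0 = 3, oc1 = 2;
  // Row-major weights: W0 is [oc0 x ic], W1 is [oc1 x oc0].
  std::vector<float> W0 = {0.5f, -1.0f, 2.0f, 0.25f, -0.5f, 1.5f};
  std::vector<float> W1 = {1.0f, 2.0f, -1.0f, 0.5f, 0.0f, 3.0f};
  std::vector<float> b0 = {0.1f, -0.2f, 0.3f};
  std::vector<float> b1 = {1.0f, -1.0f};
  std::vector<float> x = {2.0f, -3.0f};

  // Fused parameters: Wf = W1 * W0 ([oc1 x ic]) and bf = W1 * b0 + b1,
  // matching ComputeNewWeight/ComputeNewBias for kh = kw = 1.
  std::vector<float> Wf(oc1 * ic, 0.f), bf(oc1, 0.f);
  for (int k = 0; k < oc1; ++k) {
    for (int c = 0; c < ic; ++c) {
      for (int j = 0; j < oc0; ++j) {
        Wf[k * ic + c] += W1[k * oc0 + j] * W0[j * ic + c];
      }
    }
    for (int j = 0; j < oc0; ++j) bf[k] += W1[k * oc0 + j] * b0[j];
    bf[k] += b1[k];
  }

  // Reference: apply the two "convolutions" sequentially.
  std::vector<float> y0(oc0, 0.f), y_ref(oc1, 0.f), y_fused(oc1, 0.f);
  for (int j = 0; j < oc0; ++j) {
    for (int c = 0; c < ic; ++c) y0[j] += W0[j * ic + c] * x[c];
    y0[j] += b0[j];
  }
  for (int k = 0; k < oc1; ++k) {
    for (int j = 0; j < oc0; ++j) y_ref[k] += W1[k * oc0 + j] * y0[j];
    y_ref[k] += b1[k];
    for (int c = 0; c < ic; ++c) y_fused[k] += Wf[k * ic + c] * x[c];
    y_fused[k] += bf[k];
    assert(std::fabs(y_ref[k] - y_fused[k]) < 1e-5f);
    std::printf("k=%d ref=%f fused=%f\n", k, y_ref[k], y_fused[k]);
  }
  return 0;
}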