diff --git a/CMakeLists.txt b/CMakeLists.txt index dc2dee11add6c625f16cfae9e9b1c7b20533fb9d..941ee965b25102ae7a813da487fa451677f6f4a8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + endif() + + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + add_compile_options(/MP) + message(STATUS "Using parallel compiling (/MP)") + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + +endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) find_package(CUDA QUIET) endif() @@ -59,8 +84,10 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) +lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) @@ -68,7 +95,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF) lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF) lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF) -lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE) +lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF) lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF) lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." 
OFF) lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF) @@ -104,9 +131,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) + if(WIN32) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) + else() + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE) + endif() endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") @@ -128,6 +162,10 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -184,6 +222,7 @@ endif() include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package + include(external/libxsmm) # download, build, install libxsmm include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -208,7 +247,9 @@ include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION -include(flags) +if(NOT APPLE) + include(flags) +endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/build.bat b/build.bat new file mode 100644 index 0000000000000000000000000000000000000000..4510ee774ed9a3b9fe5a9d55b405b1dae39c3f45 --- /dev/null +++ b/build.bat @@ -0,0 +1,134 @@ +@echo off +setlocal +setlocal enabledelayedexpansion + +set source_path=%~dp0 +rem global variables +set BUILD_EXTRA=OFF +set BUILD_JAVA=ON +set BUILD_PYTHON=OFF +set BUILD_DIR=%source_path% +set OPTMODEL_DIR="" +set BUILD_TAILOR=OFF +set BUILD_CV=OFF +set SHUTDOWN_LOG=ON + +set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz + +set workspace=%source_path% + +:set_vcvarsall_dir +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +set tmp_var=!vcvarsall_dir! +call:remove_space +set vcvarsall_dir=!tmp_var! +IF NOT EXIST "%vcvarsall_dir%" ( + echo "------------%vcvarsall_dir% not exist------------" + goto set_vcvarsall_dir +) + +call:prepare_thirdparty + +if EXIST "%build_directory%" ( + call:rm_rebuild_dir "%build_directory%" + md "%build_directory%" +) + +set root_dir=%workspace% +set build_directory=%BUILD_DIR%\build.lite.x86 +set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code +set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug + +rem for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +rem here we fake an empty file to make cmake works. +if NOT EXIST "%GEN_CODE_PATH_PREFIX%" ( + md "%GEN_CODE_PATH_PREFIX%" +) + +type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc" + +if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" ( + md "%DEBUG_TOOL_PATH_PREFIX%" +) + +copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\" + +cd "%build_directory%" + + cmake .. 
-G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^ + -DWITH_MKLDNN=OFF ^ + -DLITE_WITH_X86=ON ^ + -DLITE_WITH_PROFILE=OFF ^ + -DWITH_LITE=ON ^ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^ + -DLITE_WITH_ARM=OFF ^ + -DWITH_GPU=OFF ^ + -DLITE_BUILD_EXTRA=ON ^ + -DLITE_WITH_PYTHON=ON ^ + -DPYTHON_EXECUTABLE="%python_path%" + +call "%vcvarsall_dir%" amd64 + +msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1 +goto:eof + +:prepare_thirdparty + SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>" + set tmp_var=!python_path! + call:remove_space + set python_path=!tmp_var! + if "!python_path!"=="" ( + set python_path=python.exe + ) else ( + if NOT exist "!python_path!" ( + echo "------------!python_path! not exist------------" + goto:eof + ) + ) + + if EXIST "%workspace%\third-party" ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists." + ) else ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz exists." + call:rm_rebuild_dir "%workspace%\third-party" + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + ) else ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists." + call:download_third_party + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) else ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists." + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + + ) + git submodule update --init --recursive +goto:eof + +:download_third_party +powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^ +'%workspace%third-party-05b862.tar.gz') +goto:eof + +:rm_rebuild_dir + del /f /s /q "%~1\*.*" >nul 2>&1 + rd /s /q "%~1" >nul 2>&1 +goto:eof + + +:remove_space +:remove_left_space +if "%tmp_var:~0,1%"==" " ( + set "tmp_var=%tmp_var:~1%" + goto remove_left_space +) + +:remove_right_space +if "%tmp_var:~-1%"==" " ( + set "tmp_var=%tmp_var:~0,-1%" + goto remove_left_space +) +goto:eof \ No newline at end of file diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 80c59f19cc4a587b1c33ad796740a4d148a7ec46..1c5b58def0d9383dabf3b5d7f814e96617f8f3b8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -34,6 +34,15 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + + if (NOT MSVC) + message(FATAL "Windows build only support msvc. 
Which was binded by the nvcc compiler of NVIDIA.") + endif(NOT MSVC) +endif(WIN32) + if(LITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) @@ -70,7 +79,7 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) + if(WIN32 OR APPLE) # openmp not support well for now on windows set(OPENMP_FLAGS "") else(WIN32) @@ -134,8 +143,15 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") + if (LITE_WITH_XTCL) + add_definitions("-DLITE_WITH_XTCL") + endif() endif() if (LITE_WITH_OPENCL) @@ -156,9 +172,10 @@ endif() if (LITE_WITH_PROFILE) add_definitions("-DLITE_WITH_PROFILE") - if (LITE_WITH_PRECISION_PROFILE) - add_definitions("-DLITE_WITH_PRECISION_PROFILE") - endif() +endif() + +if (LITE_WITH_PRECISION_PROFILE) + add_definitions("-DLITE_WITH_PRECISION_PROFILE") endif() if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -177,3 +194,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") endif(LITE_ON_MODEL_OPTIMIZE_TOOL) +if (LITE_WITH_PYTHON) + add_definitions("-DLITE_WITH_PYTHON") +endif(LITE_WITH_PYTHON) diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +if(NOT DEFINED RKNPU_DDK_ROOT) + set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT}) + if(NOT RKNPU_DDK_ROOT) + message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON") + endif() +endif() + +message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}") +find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h + PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH) +if(NOT RKNPU_DDK_INC) + message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include") +endif() + +include_directories("${RKNPU_DDK_ROOT}/include") + +set(RKNPU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(RKNPU_SUB_LIB_PATH "lib64") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(RKNPU_SUB_LIB_PATH "lib") +endif() + +find_library(RKNPU_DDK_FILE NAMES rknpu_ddk + PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}) + +if(NOT RKNPU_DDK_FILE) + message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}") +else() + message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}") + add_library(rknpu_ddk SHARED IMPORTED GLOBAL) + set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE}) +endif() + +set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs") diff --git a/cmake/device/xpu.cmake b/cmake/device/xpu.cmake index 099833ee4cf80968671036cffe89329506bbf091..823048552f3cb5f05375e97e94cd5b5ad63e7563 100644 --- a/cmake/device/xpu.cmake +++ b/cmake/device/xpu.cmake @@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT) message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON") endif() endif() - message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}") -find_path(XPU_SDK_INC NAMES xtcl.h - PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl - NO_DEFAULT_PATH) -if(NOT XPU_SDK_INC) - message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") -endif() -include_directories("${XPU_SDK_ROOT}/XTCL/include") include_directories("${XPU_SDK_ROOT}/XTDK/include") -find_library(XPU_SDK_XTCL_FILE NAMES xtcl - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_XTCL_FILE) - message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") - add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) -endif() - -find_library(XPU_SDK_TVM_FILE NAMES tvm - PATHS ${XPU_SDK_ROOT}/XTCL/so - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_TVM_FILE) - message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") - add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) -endif() - find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi PATHS ${XPU_SDK_ROOT}/XTDK/shlib NO_DEFAULT_PATH) @@ -82,23 +50,55 @@ else() set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE}) endif() -find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 - PATHS ${XPU_SDK_ROOT}/XTDK/shlib - NO_DEFAULT_PATH) - -if(NOT XPU_SDK_LLVM_FILE) - message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") -else() - message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") - add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) - set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION 
${XPU_SDK_LLVM_FILE}) +set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs") +set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs") + +if(LITE_WITH_XTCL) + find_path(XPU_SDK_INC NAMES xtcl.h + PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH) + if(NOT XPU_SDK_INC) + message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include") + endif() + include_directories("${XPU_SDK_ROOT}/XTCL/include") + + find_library(XPU_SDK_XTCL_FILE NAMES xtcl + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_XTCL_FILE) + message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}") + add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE}) + endif() + + find_library(XPU_SDK_TVM_FILE NAMES tvm + PATHS ${XPU_SDK_ROOT}/XTCL/so + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_TVM_FILE) + message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}") + add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE}) + endif() + + find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8 + PATHS ${XPU_SDK_ROOT}/XTDK/shlib + NO_DEFAULT_PATH) + + if(NOT XPU_SDK_LLVM_FILE) + message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}") + else() + message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}") + add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL) + set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE}) + endif() + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1") + + set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") + set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") endif() - -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0") - -set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs") -set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 599e7bba7eaf12da7506ce44e706bd9f50ec6998..5a757659bb036ca99326bc40cc075f761ba6e641 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -36,7 +36,16 @@ else() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen GIT_TAG - URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen before v2.3.0 + # URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen since v2.6.0 + # github address: https://github.com/eigenteam/eigen-git-mirror + # we changed the source code to adapt for windows compiling + # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h + 
###################################################################################################### + URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 142fce816de4f06aa0a36b91e3e4ecb962a8dc2a..8d094d6e064fe57b170d1a50a5457c104d3c3ac2 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -38,7 +32,17 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSEIF(APPLE) + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ae99f4df9a3676ae8f5b2c4c01305ead9b7a8254..57e332f1c103b28a194670de609ee521aa41cdf3 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) + #find_python_module(numpy REQUIRED) #find_python_module(wheel REQUIRED) #find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) + #FIND_PACKAGE(NumPy REQUIRED) #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # "please use pip to upgrade protobuf. 
pip install -U protobuf") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 225a3c19a16435c4df6403ff7d1bdd01e628dd72..d859404d559282970d96a735c400f745481e8efa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + elseif(NOT APPLE) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index d16e7af3d7a61fff0ef13cf7cfcbd7af542e7c3f..9a633409cd4d1c5e650a4794fcf30b9154c8638a 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -88,6 +88,12 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_RKNPU) + foreach(var ${lite_deps_RKNPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + if (LITE_WITH_XPU) foreach(var ${lite_deps_XPU_DEPS}) set(deps ${deps} ${var}) @@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -142,6 +148,7 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} @@ -161,8 +168,10 @@ function(lite_cc_library TARGET) else() cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() # collect targets need to compile for lite if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) add_dependencies(lite_compile_deps ${TARGET}) @@ -177,7 +186,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -191,7 +200,8 @@ function(lite_cc_binary TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} 
LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -199,7 +209,9 @@ function(lite_cc_binary TARGET) MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() if (NOT APPLE) # strip binary target to reduce size if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -226,7 +238,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -248,7 +260,8 @@ function(lite_cc_test TARGET) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -263,7 +276,9 @@ function(lite_cc_test TARGET) "${TARGET}" COMMENT "Strip debug symbols done on final executable file.") endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() file(APPEND ${offline_test_registry_file} "${TARGET}\n") # collect targets need to compile for lite @@ -280,6 +295,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -295,12 +311,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -317,9 +333,18 @@ function(add_kernel TARGET device level) if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") @@ -332,16 +357,11 @@ function(add_kernel TARGET device level) set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "X86") - if (NOT LITE_WITH_X86) + if (NOT LITE_WITH_X86 OR 
LITE_ON_MODEL_OPTIMIZE_TOOL) foreach(src ${args_SRCS}) file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") endforeach() return() - elseif (LITE_ON_MODEL_OPTIMIZE_TOOL) - foreach(src ${args_SRCS}) - file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") - endforeach() - return() endif() set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "") endif() @@ -381,8 +401,20 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "MLU") if (NOT LITE_WITH_MLU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") @@ -426,7 +458,8 @@ function(add_kernel TARGET device level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -451,11 +484,13 @@ function(add_operator TARGET level) ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA)) return() endif() + if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN)) + return() + endif() foreach(src ${args_SRCS}) if(LITE_BUILD_TAILOR) @@ -478,7 +513,8 @@ function(add_operator TARGET level) FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -486,6 +522,29 @@ function(add_operator TARGET level) ) endfunction() +#only for windows +function(create_static_lib TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 60) + list(LENGTH libs libs_len) + + foreach(lib ${libs}) + list(APPEND dummy_list ${lib}) + list(LENGTH dummy_list listlen) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) + set(dummy_list) + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) +endfunction() # Bundle several static libraries into one. 
function(bundle_static_library tgt_name bundled_tgt_name fake_target) @@ -529,7 +588,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) set(bundled_tgt_full_name ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) - #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") + message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") + + if(WIN32) + set(dummy_tgt_name dummy_${bundled_tgt_name}) + create_static_lib(${bundled_tgt_name} ${static_libs}) + add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name}) + add_dependencies(${fake_target} ${tgt_name}) + + add_library(${dummy_tgt_name} STATIC IMPORTED) + set_target_properties(${dummy_tgt_name} + PROPERTIES + IMPORTED_LOCATION ${bundled_tgt_full_name} + INTERFACE_INCLUDE_DIRECTORIES $) + add_dependencies(${dummy_tgt_name} ${fake_target}) + return() + endif() if(NOT IOS) file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index 12dd17c5a302259fb8f903735115106526716194..98c01ae92523593b075ac2335f620a63f52260fd 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}") @@ -75,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -82,16 +87,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") # add python lib if (LITE_WITH_PYTHON) - add_custom_target(publish_inference_python_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + if(WIN32) + set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd") + set(LITE_CORE_DEPS ${LITE_CORE}) + add_custom_command(OUTPUT ${LITE_CORE} + COMMAND cmake -E copy $ ${LITE_CORE} + DEPENDS lite_pybind) + add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS}) + + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory 
"${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd" + DEPENDS copy_lite_pybind + ) + + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) + add_custom_target(publish_inference_python_light_demo ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + ) + add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) + add_dependencies(publish_inference publish_inference_python_light_demo) + else() + if(APPLE) + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + else() + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + endif() add_custom_target(publish_inference_python_installer ${TARGET} - COMMAND python setup.py bdist_wheel + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} @@ -107,10 +155,27 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference 
publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) + endif(WIN32) endif() -if (LITE_WITH_X86) - add_custom_target(publish_inference_x86_cxx_lib ${TARGET} +if (LITE_WITH_CUDA OR LITE_WITH_X86) + if(APPLE) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + elseif(NOT WIN32) + add_custom_target(publish_inference_cxx_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" @@ -118,50 +183,76 @@ if (LITE_WITH_X86) COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + endif() +endif() + +if (LITE_WITH_X86) + if(WIN32) + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND 
${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + + add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) + add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + + add_custom_target(publish_inference_x86_cxx_demos ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) + add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3) + + else() + + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" ) - add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) - add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) add_dependencies(publish_inference_x86_cxx_lib test_model_bin) - add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_custom_target(publish_inference_x86_cxx_demos ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + add_dependencies(publish_inference publish_inference_x86_cxx_demos) + endif() endif() if(LITE_WITH_CUDA) - add_custom_target(publish_inference_cuda_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cuda_cxx_lib) - add_custom_target(publish_inference_cuda_cxx_demos ${TARGET} - COMMAND mkdir -p 
"${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" ) - add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos) add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared) -endif(LITE_WITH_CUDA) + add_dependencies(publish_inference publish_inference_cuda_cxx_demos) +endif(LITE_WITH_CUDA) + if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (NOT LITE_ON_TINY_PUBLISH) # add cxx lib @@ -193,7 +284,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) add_dependencies(publish_inference publish_inference_cxx_lib) if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD - COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a) + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a + COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so) endif() endif() else() diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 2a93331f4ac179cc35acb65bd9271c68a93d71ad..763b988653b60ec02b54200b232be6b79f41d357 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -8,39 +8,48 @@ if (LITE_ON_TINY_PUBLISH) set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG") set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG") endif() -set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer) -if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) + +set(light_lib_DEPS light_api paddle_api paddle_api_light) + +if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux")) #full api dynamic library - add_library(paddle_full_api_shared SHARED "") - target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc) + lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc + DEPS paddle_api paddle_api_light paddle_api_full) add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto) target_link_libraries(paddle_full_api_shared framework_proto) if(LITE_WITH_X86) add_dependencies(paddle_full_api_shared xxhash) target_link_libraries(paddle_full_api_shared xxhash) - if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) + if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) add_dependencies(paddle_full_api_shared dynload_mklml) endif() + if(WIN32) + target_link_libraries(paddle_full_api_shared shlwapi.lib) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") endif(LITE_WITH_CUDA) #light api dynamic library - lite_cc_library(paddle_light_api_shared MODULE - SRCS light_api_shared.cc - DEPS ${light_lib_DEPS} - ARM_DEPS ${arm_kernels} - CV_DEPS paddle_cv_arm - NPU_DEPS ${npu_kernels}) - - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") - add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) 
- add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) - set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) - add_dependencies(paddle_full_api_shared custom_linker_map) + lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc + DEPS ${light_lib_DEPS} + ARM_DEPS ${arm_kernels} + CV_DEPS paddle_cv_arm + NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} + ) + + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels}) + if(NOT APPLE AND NOT WIN32) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") @@ -55,6 +64,11 @@ else() # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -65,6 +79,7 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels}) endif() @@ -78,6 +93,12 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -86,6 +107,7 @@ message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") @@ -103,6 +125,7 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels}) @@ -124,6 +147,7 @@ lite_cc_library(light_api SRCS light_api.cc CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -143,6 +167,7 @@ if(WITH_TESTING) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -188,7 +213,11 @@ if(WITH_TESTING) lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} - ARGS --model_dir=${LITE_MODEL_DIR}/resnet50) + ARGS --model_dir=${LITE_MODEL_DIR}/classify) + 
lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc + DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils + ${ops} ${host_kernels} ${bm_kernels} ${bm_bridges} + ARGS --model_dir=${LITE_MODEL_DIR}/yolov3) endif() endif() endif() @@ -240,6 +269,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -266,7 +296,8 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} CL_DEPS ${opencl_kernels} - FPGA_DEPS ${fpga_kernels}) + FPGA_DEPS ${fpga_kernels} + BM_DEPS ${bm_kernels}) # The final inference library for just MobileConfig. bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api) target_link_libraries(paddle_api_full ${cuda_deps}) @@ -282,6 +313,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -291,6 +323,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model @@ -316,7 +349,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_ if (LITE_ON_MODEL_OPTIMIZE_TOOL) message(STATUS "Compiling opt") lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc - DEPS gflags kernel op optimizer mir_passes utils) + DEPS gflags kernel op optimizer mir_passes utils ${host_kernels}) add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) @@ -326,6 +359,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -347,6 +381,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -360,6 +395,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -373,6 +409,7 @@ if(NOT IOS) MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -383,6 +420,7 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -395,17 +433,20 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} 
CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/api/_paddle_use_ops.h b/lite/api/_paddle_use_ops.h index 6da47e53789d651f4a36d0b8d6a7ca1ea5a0a3d3..778b4dc7a8d19bc07d641e2923234d84c59099c5 100644 --- a/lite/api/_paddle_use_ops.h +++ b/lite/api/_paddle_use_ops.h @@ -63,6 +63,7 @@ USE_LITE_OP(swish) USE_LITE_OP(log) USE_LITE_OP(exp) USE_LITE_OP(conv2d_transpose) +USE_LITE_OP(depthwise_conv2d_transpose) USE_LITE_OP(negative) USE_LITE_OP(pad2d) USE_LITE_OP(power) diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index d53de7bf2ed00fed70bbd1f70729a051e5d7203b..0ce7f6f0d5aa5bb5c7bc66dbeddaa618fa6466e6 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -13,7 +13,13 @@ // limitations under the License. #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif +#define GLOG_NO_ABBREVIATED_SEVERITIES // glog's abbreviated severities conflict with windows.h under MSVC #include #include #include @@ -27,6 +33,9 @@ #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", "the path of the model, the model and param files is under " @@ -44,7 +53,10 @@ DEFINE_string(input_shape, "set input shapes according to the model, " "separated by colon and comma, " "such as 1,3,244,244"); -DEFINE_string(input_img_path, "", "the path of input image"); +DEFINE_string(input_img_path, + "", + "the path of the input image; if not set, " + "the model input will be filled with 1.0."); DEFINE_int32(warmup, 0, "warmup times"); DEFINE_int32(repeats, 1, "repeats times"); DEFINE_int32(power_mode, @@ -57,16 +69,8 @@ DEFINE_int32(power_mode, DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", - "save benchmark " - "result to the file"); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. "); -DEFINE_bool(is_quantized_model, - false, - "if set true, " - "test the performance of the quantized model. 
"); + "save the inference time to the file."); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -87,10 +91,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { std::vector vaild_places = { Place{TARGET(kARM), PRECISION(kFloat)}, }; - if (FLAGS_is_quantized_model) { - vaild_places.insert(vaild_places.begin(), - Place{TARGET(kARM), PRECISION(kInt8)}); - } config.set_valid_places(vaild_places); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -106,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK void Run(const std::vector& input_shape, - const std::string& model_dir, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -122,10 +130,7 @@ void Run(const std::vector& input_shape, auto input_tensor = predictor->GetInput(0); input_tensor->Resize(input_shape); auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shape.size(); ++i) { - input_num *= input_shape[i]; - } + int64_t input_num = ShapeProduction(input_shape); if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; @@ -173,26 +178,78 @@ void Run(const std::vector& input_shape, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + LOG(INFO) << "output data[0:10]:"; + for (int i = 0; i < 10; i++) { + LOG(INFO) << out_data[i]; + } + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (The path of the model that is optimized\n" + " by opt. If the model is optimized, please set the param.) \n" + " type: string \n" + " --model_dir (The path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (The filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (The filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (Set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (The path of input image, if not set\n" + " input_img_path, the input will be 1.0.) 
type: string \n " + " --power_mode (Arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (Repeats times) type: int32 default: 1 \n" + " --result_filename (Save the inference time to the file.) type: \n" + " string default: result.txt \n" + " --threads (Threads num) type: int32 default: 1 \n" + " --warmup (Warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path. Otherwise, \n" + " set model_dir, model_filename and param_filename according to \n" + " the model. \n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "" || FLAGS_result_filename == "") { - LOG(INFO) << "please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - if (FLAGS_model_dir.back() == '/') { - FLAGS_model_dir.pop_back(); - } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -208,19 +265,31 @@ int main(int argc, char** argv) { } return shape; }; - std::vector input_shape = get_shape(FLAGS_input_shape); - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? 
save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shape, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 556a9e0af01854ff5c57a14dade72b81ed255964..f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -19,6 +19,7 @@ #include #include #include +#include "lite/api/paddle_use_passes.h" #include "lite/utils/io.h" namespace paddle { @@ -291,10 +292,13 @@ void Predictor::Build(const cpp::ProgramDesc &desc, program_desc_ = desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; - inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); - inner_places.emplace_back( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + for (auto &valid_place : valid_places) { + inner_places.emplace_back( + Place(TARGET(kHost), valid_place.precision, valid_place.layout)); + } + + // Analyze whether the model is quantized. + // For a quantized model, add place(arm, int8) to inner_places const std::vector quant_dequant_op = { "fake_quantize_abs_max", "fake_quantize_range_abs_max", @@ -317,7 +321,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc, } } if (is_quantized_model) { - inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)}); + inner_places.insert(inner_places.begin(), + Place{TARGET(kARM), PRECISION(kInt8)}); } Program program(desc, scope_, inner_places); diff --git a/lite/api/cxx_api.h b/lite/api/cxx_api.h index e63893cb91e112beb6be50bd661a57b9738e5fb1..146556756af7e0b56ae38b5303e622c97dfe58af 100644 --- a/lite/api/cxx_api.h +++ b/lite/api/cxx_api.h @@ -43,6 +43,7 @@ class LITE_API Predictor { public: // Create an empty predictor. Predictor() { scope_ = std::make_shared(); } + // Create a predictor with the weight variable scope set. explicit Predictor(const std::shared_ptr& root_scope) : scope_(root_scope) {} diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index d4da1c429b5f66085b659047636383ecd546d937..fc1a0648c0fd4e50621cfaf75495da6df6ccd86e 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,19 +20,35 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) #include #include "lite/backends/x86/mklml.h" #endif - namespace paddle { namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; + auto places = config.valid_places(); + std::vector passes{}; #ifdef LITE_WITH_CUDA - Env::Init(); + // if kCUDA is included in valid places, it should be initialized first, + // otherwise skip this step.
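+ // Example (illustrative): with valid_places = {Place{TARGET(kCUDA), PRECISION(kFloat)}, Place{TARGET(kHost), PRECISION(kFloat)}}, the loop below finds the kCUDA entry, initializes the CUDA Env once, and registers the multi_stream_analysis_pass when multi_stream is enabled in the config.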
+ for (auto &p : places) { + if (p.target == TARGET(kCUDA)) { + Env::Init(); + if (config_.multi_stream()) { + passes = {"multi_stream_analysis_pass"}; + VLOG(3) << "add pass: " << passes[0]; + } + break; + } + } #endif #ifdef LITE_WITH_MLU Env::Init(); @@ -43,8 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config.mlu_first_conv_std(), config.mlu_input_layout()); #endif // LITE_WITH_MLU - auto places = config.valid_places(); - std::vector passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; @@ -56,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); - #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index b641973a15b2e6abc1cf4c999d759271f7522638..7a7f870a9ac38e4103f3f8a7c6b95a98bb6722db 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -13,13 +13,10 @@ // limitations under the License. #include "lite/api/light_api.h" +#include +#include #include "paddle_use_kernels.h" // NOLINT #include "paddle_use_ops.h" // NOLINT -#ifndef LITE_ON_TINY_PUBLISH -#include "lite/api/paddle_use_passes.h" -#endif - -#include namespace paddle { namespace lite { @@ -32,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For post-training weight quantization, load the int8/16 weights + // of the optimized model and dequantize them to fp32. DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -139,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 1. Create op first Program program(prog, scope_, {}); - // 2. Create Instructs +// 2. Create Instructs +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif // Create the kernels of the target places, and filter out the specific // kernel with the target alias.
@@ -155,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { return it->alias() == alias; }); CHECK(it != kernels.end()); + +#ifdef LITE_WITH_OPENCL + if ((*it)->target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx).As().CopySharedTo(&ctx->As()); + (*it)->SetContext(std::move(ctx)); + } else { + (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); + } +#else (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); +#endif insts.emplace_back(op, std::move(*it)); } @@ -166,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : 
input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_shared.cc b/lite/api/light_api_shared.cc deleted file mode 100644 index cfe3d9de09a646e33c4a116bb3cd087d28aa24c2..0000000000000000000000000000000000000000 --- a/lite/api/light_api_shared.cc +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "lite/api/paddle_api.h" - -namespace paddle { -namespace lite_api { - -void RunModel() { - // 1. Set MobileConfig - MobileConfig mobile_config; - - // 2. 
Create PaddlePredictor by MobileConfig - std::shared_ptr mobile_predictor = - CreatePaddlePredictor(mobile_config); -} - -} // namespace lite_api -} // namespace paddle diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 51f9b565196d30520f0cf73ea41a01fed0cc49e8..efad7b74e943c29c9af1af5c14ac51621eefe576 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -23,6 +23,7 @@ #include "kernel_src_map.h" // NOLINT #include "lite/api/cxx_api.h" #include "lite/api/paddle_api.h" +#include "lite/api/paddle_use_kernels.h" #include "lite/api/paddle_use_ops.h" #include "lite/api/paddle_use_passes.h" #include "lite/core/op_registry.h" @@ -108,6 +109,10 @@ std::vector ParserValidPlaces() { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); } else if (target_repr == "mlu") { valid_places.emplace_back(TARGET(kMLU)); } else { @@ -186,6 +191,7 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -250,16 +256,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/paddle_api.cc b/lite/api/paddle_api.cc index 91edb2cda7849211f288d64e00191ddba8f82f19..daef2c66dda5188a1eec25c3d5f045f1fa705e1e 100644 --- a/lite/api/paddle_api.cc +++ b/lite/api/paddle_api.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/api/paddle_api.h" +#include "lite/core/context.h" #include "lite/core/device_info.h" #include "lite/core/target_wrapper.h" #include "lite/core/tensor.h" @@ -203,6 +204,7 @@ void ConfigBase::set_threads(int threads) { #endif } +#ifdef LITE_WITH_MLU void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) { mlu_core_version_ = core_version; } @@ -227,12 +229,32 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const { int CxxConfig::mlu_core_number() const { return mlu_core_number_; } DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; } bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; } -std::vector CxxConfig::mlu_first_conv_mean() const { +const std::vector &CxxConfig::mlu_first_conv_mean() const { return mlu_first_conv_mean_; } -std::vector CxxConfig::mlu_first_conv_std() const { +const std::vector &CxxConfig::mlu_first_conv_std() const { return mlu_first_conv_std_; } +#endif + +void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) { +#ifdef LITE_WITH_XPU + lite::Context::SetWorkspaceL3Size(l3_size); +#else + LOG(WARNING) << "The invoking of the function " + "'set_xpu_workspace_l3_size_per_thread' is ignored, please " + "rebuild it with LITE_WITH_XPU=ON."; +#endif +} + +void CxxConfig::set_xpu_dev_per_thread(int dev_no) { +#ifdef LITE_WITH_XPU + lite::Context::SetDev(dev_no); +#else + LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is " + "ignored, please rebuild it with LITE_WITH_XPU=ON."; +#endif +} // set model data in combined format, `set_model_from_file` refers to loading // model from file, set_model_from_buffer refers to loading model from memory diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index 0cb60bf84fe5063287646f825dc74dc5f51bee11..79ab98da799a99540217d55e3d40b46800f17626 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,12 +136,17 @@ class LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_CUDA + bool multi_stream_{false}; +#endif +#ifdef LITE_WITH_MLU lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; bool mlu_use_first_conv_{false}; std::vector mlu_first_conv_mean_; std::vector mlu_first_conv_std_; +#endif public: void set_valid_places(const std::vector& x) { valid_places_ = x; } @@ -169,20 +174,41 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif +#ifdef LITE_WITH_CUDA + void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } + int multi_stream() const { return multi_stream_; } +#endif +#ifdef LITE_WITH_MLU + // set MLU core version, which is used when compiling MLU kernels void set_mlu_core_version(lite_api::MLUCoreVersion core_version); + // set MLU core number, which is used when compiling MLU kernels void set_mlu_core_number(int core_number); + // set MLU input layout. User can specify layout of input data to be NHWC, + // default is NCHW void set_mlu_input_layout(DataLayoutType layout); + // whether use MLU's first conv kernel. 
First conv is a special kernel + // provided by MLU; its input is uint8, and it also needs two 3-dimensional + // vectors which hold all inputs' mean and std values void set_mlu_use_first_conv(bool use_first_conv); + // set the 3-dimensional mean vector used by MLU's first conv void set_mlu_first_conv_mean(const std::vector& mean); + // set the 3-dimensional std vector used by MLU's first conv void set_mlu_first_conv_std(const std::vector& std); lite_api::MLUCoreVersion mlu_core_version() const; int mlu_core_number() const; DataLayoutType mlu_input_layout() const; bool mlu_use_first_conv() const; - std::vector mlu_first_conv_mean() const; - std::vector mlu_first_conv_std() const; + const std::vector& mlu_first_conv_mean() const; + const std::vector& mlu_first_conv_std() const; +#endif + + // XPU only, set the size of the workspace memory from L3 cache for the + // current thread. + void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); + // XPU only, specify the target device ID for the current thread. + void set_xpu_dev_per_thread(int dev_no = 0); }; /// MobileConfig is the config for the light weight predictor, it will skip diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index e99127e233bc4adf159a6a567dfb15f6fd784a27..9dc5c9e857243ecb57f785737b00929e36c5d83c 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -18,20 +18,21 @@ */ #pragma once -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); +// some platform-independent definitions +#include "lite/utils/macros.h" + +#define USE_LITE_OP(op_type__) \ + extern int touch_op_##op_type__(); \ + int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); + UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ +#define USE_MIR_PASS(name__) \ + extern bool mir_pass_registry##name__##_fake(); \ + static bool mir_pass_usage##name__ UNUSED = \ mir_pass_registry##name__##_fake(); #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index aceb047b64f54ac18ac492ef495d32c3180ad4b4..efd22fc22a4180c3cac9f269fc14f6541c16b885 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "mlu"}; + "mlu", + "rknpu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) { "kNPU", "kXPU", "kMLU", - "kBM"}; + "kBM", + "kRKNPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index b2d1e6a08954c7f00ae24cfb6be43dac3b168228..2b271a4872e7e14c48632a2bb1aae56d53145cba 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -54,8 +54,9 @@ enum class TargetType :
int { kXPU = 9, kBM = 10, kMLU = 11, + kRKNPU = 12, kAny = 6, // any target - NUM = 12, // number of fields. + NUM = 13, // number of fields. }; enum class PrecisionType : int { kUnk = 0, @@ -101,7 +102,10 @@ enum class ActivationType : int { kTanh = 6, kSwish = 7, kExp = 8, - NUM = 9, + kAbs = 9, + kHardSwish = 10, + kReciprocal = 11, + NUM = 12, }; static size_t PrecisionTypeLength(PrecisionType type) { diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 8f80f0014f7f6213a010035f581ad4dcb715aba1..1eb5af74d29f72fa90712d04c922958755d79265 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -42,11 +42,13 @@ USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); +USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); -USE_MIR_PASS(subgraph_cast_display_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); +USE_MIR_PASS(__xpu__resnet_fuse_pass); +USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8..5dfecf8c619d8cf9be7a03fa46b4e86a6e641a29 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -17,8 +17,12 @@ execute_process( OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE ) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - +if(APPLE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +endif() add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..72a75d9caaa79fa96e52e8603ae6886aac341009 100644 --- a/lite/api/python/__init__.py +++ b/lite/api/python/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
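+# Note: on Windows the bundled third-party DLLs (e.g. mklml.dll and libiomp5md.dll when MKL is enabled) are packaged under paddlelite/libs; the code below prepends that directory to PATH and sys.path so the lite extension can locate them at import time.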
+ +import os +import sys + +if os.name =='nt': + current_path = os.path.abspath(os.path.dirname(__file__)) + third_lib_path = current_path + os.sep + 'libs' + os.environ['path'] = third_lib_path+ ';' + os.environ['path'] + sys.path.insert(0, third_lib_path) diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index b1de18d50c1582b0f872ad38d24939665ab1d3b0..fe4cdb5a73d62afa98fb8c343e8a6a20388e293b 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() -lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +if(WIN32) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(lite_pybind ${os_dependency_modules}) +else() + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +endif(WIN32) + if (LITE_ON_TINY_PUBLISH) set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") endif() diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index 79028fb7493bf55eab74aa76ee51ac79f418ba0a..b04a6077f5aafecf76fed0b0dee5c56919b9302e 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -34,20 +34,27 @@ else: # core lib of paddlelite is stored as lite.so LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' -PACKAGE_DATA = {'paddlelite': ['lite.so']} +PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']} # put all thirdparty libraries in paddlelite.libs PACKAGE_DATA['paddlelite.libs'] = [] LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) - PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] - + if os.name != 'nt': + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + else: + PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll'] + shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll'] # link lite.so to paddlelite.libs -COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ -/inference_lite_lib/python/install/lite/lite.so" -if os.system(COMMAND) != 0: - raise Exception("patch third_party libs failed, command: %s" % COMMAND) +if os.name != 'nt': + COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ + /inference_lite_lib/python/install/lite/lite.so" + if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + + # remove unused paddle/libs/__init__.py if os.path.isfile(LIB_PATH+'/__init__.py'): @@ -61,6 +68,14 @@ PACKAGE_DIR = { 'paddlelite': LITE_PATH } +if os.name == 'nt': + # fix the path separator under windows + fix_package_dir = {} + for k, v in PACKAGE_DIR.items(): + fix_package_dir[k] = v.replace('/', '\\') + PACKAGE_DIR = fix_package_dir + + setup( name='paddlelite', version=PADDLELITE_VERSION, diff --git a/lite/api/python/setup_mac.py.in b/lite/api/python/setup_mac.py.in new file mode 100644 index 0000000000000000000000000000000000000000..c8dfe2cc5c13b3105fc1aed404676eefd40877e8 --- /dev/null +++ b/lite/api/python/setup_mac.py.in @@ -0,0 +1,73 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' + +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib'] + +# link lite.so to paddlelite.libs +COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_classify_lite_bm.cc b/lite/api/test_classify_lite_bm.cc index 7da7dc03745aa623e35dec5b344e16de03cf5aca..b2507e28adbe050e4715e0c28a433a259607e7a9 100644 --- a/lite/api/test_classify_lite_bm.cc +++ b/lite/api/test_classify_lite_bm.cc @@ -36,7 +36,8 @@ void TestModel(const std::vector& valid_places) { predictor.Build(FLAGS_model_dir, "", "", valid_places, passes); auto* input_tensor = predictor.GetInput(0); - input_tensor->Resize(DDim(std::vector({1, 3, 224, 224}))); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); auto* data = input_tensor->mutable_data(); auto item_size = input_tensor->dims().production(); if (FLAGS_input_img_txt_path.empty()) { @@ -67,15 +68,13 @@ void TestModel(const std::vector& valid_places) { << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 << " ms in average."; - auto* out = predictor.GetOutput(0); - ASSERT_EQ(out->dims().size(), 2); - ASSERT_EQ(out->dims()[0], 1); - ASSERT_EQ(out->dims()[1], 1000); - - auto* out_data = out->data(); + auto out = predictor.GetOutputs(); FILE* fp = fopen("result.txt", "wb"); - for (int i = 0; i < out->numel(); i++) { - fprintf(fp, "%f\n", out_data[i]); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } } fclose(fp); } diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index a17fc331310cfe17ec36be504b94ddacc724e90f..fa6e20230d68c73b0720606816a4594077278d56 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -15,7 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif #include #include diff --git a/lite/api/test_yolov3_lite_bm.cc b/lite/api/test_yolov3_lite_bm.cc new file mode 100644 index 0000000000000000000000000000000000000000..d70ecf3c03955286244aa13cfe65f19569a55930 --- /dev/null +++ b/lite/api/test_yolov3_lite_bm.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
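+// Runs YOLOv3 on the BM backend: input 0 takes an image blob of shape {1, 3, im_height, im_width}, input 1 takes the im_shape pair {im_height, im_width}, and every output tensor is dumped to result.txt.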
+ +#include +#include +#include +#include +#include "lite/api/cxx_api.h" +#include "lite/api/paddle_use_kernels.h" +#include "lite/api/paddle_use_ops.h" +#include "lite/api/paddle_use_passes.h" +#include "lite/api/test_helper.h" +#include "lite/core/op_registry.h" + +DEFINE_string(input_img_txt_path, + "", + "if set input_img_txt_path, read the img filename as input."); + +namespace paddle { +namespace lite { + +void TestModel(const std::vector& valid_places) { + lite::Predictor predictor; + std::vector passes; + predictor.Build(FLAGS_model_dir, + FLAGS_model_dir + "/model", + FLAGS_model_dir + "/params", + valid_places, + passes); + + auto* input_tensor = predictor.GetInput(0); + input_tensor->Resize(DDim( + std::vector({1, 3, FLAGS_im_height, FLAGS_im_width}))); + auto* data = input_tensor->mutable_data(); + auto item_size = input_tensor->dims().production(); + if (FLAGS_input_img_txt_path.empty()) { + for (int i = 0; i < item_size; i++) { + data[i] = 1; + } + } else { + std::fstream fs(FLAGS_input_img_txt_path, std::ios::in); + if (!fs.is_open()) { + LOG(FATAL) << "open input_img_txt error."; + } + for (int i = 0; i < item_size; i++) { + fs >> data[i]; + } + } + auto* image_tensor = predictor.GetInput(1); + image_tensor->Resize(DDim(std::vector({1, 2}))); + data = image_tensor->mutable_data(); + data[0] = FLAGS_im_height; + data[1] = FLAGS_im_width; + + for (int i = 0; i < FLAGS_warmup; ++i) { + predictor.Run(); + } + + auto start = GetCurrentUS(); + for (int i = 0; i < FLAGS_repeats; ++i) { + predictor.Run(); + } + + LOG(INFO) << "================== Speed Report ==================="; + LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads + << ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats + << ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0 + << " ms in average."; + + auto out = predictor.GetOutputs(); + FILE* fp = fopen("result.txt", "wb"); + for (int i = 0; i < out.size(); i++) { + auto* out_data = out[i]->data(); + for (int j = 0; j < out[i]->numel(); j++) { + fprintf(fp, "%f\n", out_data[j]); + } + } + fclose(fp); +} + +TEST(Yolov3, test_bm) { + std::vector valid_places({Place{TARGET(kBM), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kFloat)}}); + + TestModel(valid_places); +} + +} // namespace lite +} // namespace paddle diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index 896b47a97fb20e6935764e12fbe9ebd646a4f816..e1c315f4a63ffd3ed8f51fa4b73ac88b50835cab 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -13,7 +13,9 @@ // limitations under the License. #include +#ifdef PADDLE_WITH_TESTING #include +#endif #include #include #include "lite/api/cxx_api.h" diff --git a/lite/backends/CMakeLists.txt b/lite/backends/CMakeLists.txt index fb459ae3621d1281f0a2433ca6b237a165d078a1..1e8734a6e45ead93bb33024a2e918cdb401265d9 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 9f478eab60538eeca38415afea4e0989eff5a04e..1d01642100109d14a413ad5e274606c88bf0005a 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "lite/backends/arm/math/activation.h" +#include #include #include "lite/backends/arm/math/funcs.h" @@ -711,6 +712,47 @@ void act_square(const float* din, float* dout, int size, int threads) { } } +template <> +void act_hard_swish(const float* din, + float* dout, + int size, + float threshold, + float scale, + float offset, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) * + ptr_in[0] / scale; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_reciprocal(const float* din, + float* dout, + int size, + int threads) { + const float* ptr_in = din; + float* ptr_out = dout; + for (int i = 0; i < size; ++i) { + ptr_out[0] = 1.0 / ptr_in[0]; + ptr_in++; + ptr_out++; + } +} + +template <> +void act_abs(const float* din, float* dout, int size, int threads) { + for (int i = 0; i < size; ++i) { + dout[0] = (din[0] > 0 ? din[0] : -din[0]); + din++; + dout++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index 63f4418d70db25f98dea2a405de1f4bb6b0b9111..50f60f300bbab9b9f0bcad222f31699b7bfadeab 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -72,6 +72,20 @@ void act_rsqrt(const T* din, T* dout, int size, int threads); template void act_square(const T* din, T* dout, int size, int threads); +template +void act_hard_swish(const T* din, + T* dout, + int size, + float threshold, + float scale, + float offset, + int threads); +template +void act_reciprocal(const T* din, T* dout, int size, int threads); + +template +void act_abs(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc index 65f93453388d7f41d73669f583d189bec9035bb5..e54d70ffbb119d0a91b82f67b77c9d778dea17bf 100644 --- a/lite/backends/arm/math/concat.cc +++ b/lite/backends/arm/math/concat.cc @@ -16,46 +16,3 @@ #include #include #include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output) { - int64_t concat_input_size = 1; - int64_t num_cancats = 1; - auto dim_0 = input[0]->dims(); - size_t num = input.size(); - for (int i = axis + 1; i < dim_0.size(); i++) { - concat_input_size *= dim_0[i]; - } - for (int i = 0; i < axis; i++) { - num_cancats *= dim_0[i]; - } - float *dst_ptr = output->mutable_data(); - const int out_concat_axis = output->dims()[axis]; - int64_t offset_concat_axis = 0; - int64_t out_sum = out_concat_axis * concat_input_size; - for (int n = 0; n < num; n++) { - auto dims = input[n]->dims(); - const float *src_ptr = input[n]->data(); - int64_t in_concat_axis = dims[axis]; - float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; - int64_t in_sum = in_concat_axis * concat_input_size; - for (int i = 0; i < num_cancats; i++) { - std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum); - dout_ptr += out_sum; - src_ptr += in_sum; - } - offset_concat_axis += in_concat_axis; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h index 4c6159e9e09b66edde812e5098e1263963f3e4da..44e8bf73e220f94dca4ba6713debfae77029867a 100644 --- 
a/lite/backends/arm/math/concat.h +++ b/lite/backends/arm/math/concat.h @@ -25,9 +25,39 @@ namespace lite { namespace arm { namespace math { -void concat_func(const std::vector &input, +template +void concat_func(const std::vector& input, const int axis, - lite::Tensor *output); + lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0]->dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n]->dims(); + auto* src_ptr = input[n]->data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += in_concat_axis; + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/elementwise.cc b/lite/backends/arm/math/elementwise.cc index 47a4d427f5400212a80fc31336e462a1c48bd640..4d08c1e957d43b5b748ffdb90fd14a07a61d0183 100644 --- a/lite/backends/arm/math/elementwise.cc +++ b/lite/backends/arm/math/elementwise.cc @@ -302,10 +302,10 @@ void elementwise_add_grad_broadcast(const float* dout_grad, int pre, int n, int post) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, pre * n * post); } - if (y_grad) { + if (y_grad != nullptr) { memset(y_grad, 0, n * sizeof(float)); #pragma omp parallel for for (int i = 0; i < pre; ++i) { @@ -582,10 +582,10 @@ void elementwise_sub_grad(const float* dout_grad, float* x_grad, float* y_grad, int num) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, num); } - if (y_grad) { + if (y_grad != nullptr) { int cnt = num >> 4; int remain = num & 0x0f; float32x4_t minus = vdupq_n_f32(-1); @@ -624,10 +624,10 @@ void elementwise_sub_grad_broadcast(const float* dout_grad, int pre, int n, int post) { - if (x_grad) { + if (x_grad != nullptr) { elementwise_add_grad(dout_grad, x_grad, pre * n * post); } - if (y_grad) { + if (y_grad != nullptr) { memset(y_grad, 0, n * sizeof(float)); #pragma omp parallel for for (int i = 0; i < pre; ++i) { diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc index 56104550d8d68e53ad9a2ac3148887d67480d6f6..a84eef2970b2837159609c1ded1ca0d9991ccfc6 100644 --- a/lite/backends/arm/math/reduce_mean.cc +++ b/lite/backends/arm/math/reduce_mean.cc @@ -198,6 +198,23 @@ void reduce_mean_hw(const float* src, reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); } +template <> +void mean_grad(const float* out_grad, float* in_grad, int size) { + float grad = out_grad[0] / size; + float32x4_t grad_v = vdupq_n_f32(grad); + int loop = size >> 2; + int remain = size & 3; + +#pragma omp parallel for + for (int i = 0; i < loop; ++i) { + vst1q_f32(in_grad, grad_v); + in_grad += 4; + } + for (int i = 0; i < remain; ++i) { + in_grad[i] = grad; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h index 
277ed209c058b5b4be76ce18a00683610e6afb7a..aaa9ff42c18d0cfa6a7cf11408dfba06a9444adc 100644 --- a/lite/backends/arm/math/reduce_mean.h +++ b/lite/backends/arm/math/reduce_mean.h @@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, int height_in, int width_in); +template +void mean_grad(const T* out_grad, T* in_grad, int size); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index 35f5f0ce2d93db59cbb856d8008e6f3138633e42..0689bb706ab3bac4b8b97059017181ef24dd8ee4 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) + +lite_cc_library(cuda_context SRCS context.cc DEPS device_info) add_subdirectory(math) diff --git a/lite/backends/cuda/context.cc b/lite/backends/cuda/context.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bac4c442c28848d38bd434d045c7888a1a92ac8 --- /dev/null +++ b/lite/backends/cuda/context.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/context.h" + +namespace paddle { +namespace lite {} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/context.h b/lite/backends/cuda/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5bed30a9603c6f6a48169ae31d66c989bd891836 --- /dev/null +++ b/lite/backends/cuda/context.h @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { + +template +class Context; + +using CUDAContext = Context; + +// Only works with CUDA kernels. 
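+// Rough usage sketch: InitOnce() creates the shared cuBLAS handle once (intended to be called by ContextScheduler), Init(dev_id, exec_stream_id, io_stream_id) binds the context to a device and its exec/IO streams, CopySharedTo() hands the cuBLAS handle to per-kernel contexts, and SetSyncStreams()/Sync() record and wait on events when several exec streams need to be synchronized.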
+template <> +class Context { + public: + typename Env::Devs& devs = + Env::Global(); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() { + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No CUDA device(s) found, CUDAContext init failed."; + } + } + void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or the current target does not exist!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + if (io_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "data stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + io_stream_id = 0; + } + if (exec_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "exec stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + exec_stream_id = 0; + } + + exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; + io_stream_ = devs[dev_id].io_streams()[io_stream_id]; + + exec_stream_id_ = exec_stream_id; + io_stream_id_ = io_stream_id; + need_sync_ = false; + } + void CopySharedTo(CUDAContext* ctx) { + CHECK(ctx); + CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; + ctx->cublas_fp32_ = cublas_fp32_; + } + + const cudaStream_t& exec_stream() const { return exec_stream_; } + void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + + const cudaStream_t& io_stream() const { return io_stream_; } + void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + + std::shared_ptr> cublas_fp32() { return cublas_fp32_; } + void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { + cublas_fp32_ = cublas_fp32; + } + + const std::vector& input_events() { return input_events_; } + void SetInputEvents(const std::vector& input_events) { + input_events_.clear(); + input_events_.assign(input_events.begin(), input_events.end()); + } + + const std::vector& output_events() { return output_events_; } + void SetOutputEvents(const std::vector& output_events) { + output_events_.clear(); + output_events_.assign(output_events.begin(), output_events.end()); + } + + std::vector all_exec_streams() { + int dev_id = TargetWrapper::GetCurDevice(); + return devs[dev_id].exec_streams(); + } + + void SetSyncStreams(const std::vector& nums) { + sync_streams_.clear(); + std::vector exec_streams = all_exec_streams(); + for (size_t i = 0; i < nums.size(); ++i) { + CHECK(nums[i] >= 0 && nums[i] < static_cast(exec_streams.size())) + << "stream id is not valid"; + sync_streams_.push_back(exec_streams[nums[i]]); + } + InitSyncEvents(nums.size()); + } + + void InitSyncEvents(const int num) { + sync_events_.clear(); + for (int i = 0; i < num; ++i) { + cudaEvent_t eve; + TargetWrapperCuda::CreateEventWithFlags(&eve); + sync_events_.push_back(eve); + } + } + + void SetNeedSync(bool sync) { need_sync_ = sync; } + bool need_sync() const { return need_sync_; } + + void Sync() { + CHECK_EQ(sync_streams_.size(), sync_events_.size()); + for (size_t i = 0; i < sync_events_.size(); ++i) { + TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]); + TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]); + } + } + + std::string name() const { return "CUDAContext"; } + + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ =
const_cast(context).cublas_fp32(); + return *this; + } + + private: + int device_id_; + // overall information + int exec_stream_id_; + int io_stream_id_; + cudaStream_t exec_stream_; + cudaStream_t io_stream_; + + // not thread-safe, should allocate for each thread. + std::shared_ptr> cublas_fp32_; + + // kernel information + std::vector input_events_; + std::vector output_events_; + // multi stream sync. + std::vector sync_streams_; + std::vector sync_events_; + bool need_sync_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/math/utils.h b/lite/backends/cuda/math/utils.h index b6aa9c7d160ad6c8b60b132e4a2bbd7ae1e0b9ff..78aa689ff767e8a454dec3aa48a97ecefafdbe7a 100644 --- a/lite/backends/cuda/math/utils.h +++ b/lite/backends/cuda/math/utils.h @@ -29,6 +29,7 @@ enum class BinaryOperation { kADD = 0, kMUL = 1, kDIV = 2, + kSUB = 3, }; template @@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x, if (type == BinaryOperation::kADD) return x + y; if (type == BinaryOperation::kMUL) return x * y; if (type == BinaryOperation::kDIV) return x / y; + if (type == BinaryOperation::kSUB) return x - y; } template diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index f0105e060f03df3e4d49c358cf314730cdd16393..eff959d992200592c21a024f56713b9abb4b87fb 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -54,19 +51,20 @@ cl::Program &CLContext::GetProgram(const std::string &file_name, void CLContext::AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options) { + const std::string &options, + const std::string &time_stamp) { cl_int status{CL_SUCCESS}; VLOG(3) << " --- to get program " << file_name << " --- "; auto program = GetProgram(file_name, options); VLOG(3) << " --- end get program --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( + std::shared_ptr kernel( new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; kernels_.emplace_back(std::move(kernel)); STL::stringstream kernel_key; - kernel_key << kernel_name << options; + kernel_key << kernel_name << options << time_stamp; kernel_offset_[kernel_key.str()] = kernels_.size() - 1; } @@ -121,14 +119,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) { } } +cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divisor) { + int preferred_lws = 0; +#if 1 + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; +#else + auto gws2 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws0 = global_work_size[2]; +#endif + if (divisor > 1) { + max_work_size /= divisor; + } + if (preferred_lws > 0 && preferred_lws <= max_work_size) { + max_work_size = preferred_lws; + } + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? 
gws1 / 2 : 1; + } + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; + } + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; + } +#if 1 + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; +#else + return cl::NDRange{static_cast(gws2), + static_cast(gws1), + static_cast(gws0)}; +#endif +} + cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size) { int preferred_lws = 0; int divisor = 2; - auto tmp0 = global_work_size[0]; - auto tmp1 = global_work_size[1]; - auto tmp2 = global_work_size[2]; + auto gws0 = global_work_size[0]; + auto gws1 = global_work_size[1]; + auto gws2 = global_work_size[2]; if (divisor > 1) { max_work_size /= divisor; @@ -136,18 +173,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size, if (preferred_lws > 0 && preferred_lws <= max_work_size) { max_work_size = preferred_lws; } - while (tmp1 > max_work_size && max_work_size > 0) { - tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1; + while (gws1 > max_work_size && max_work_size > 0) { + gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1; } - while (tmp2 * tmp1 > max_work_size && max_work_size > 0) { - tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1; + while (gws2 * gws1 > max_work_size && max_work_size > 0) { + gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1; } - while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) { - tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1; + while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) { + gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1; } - return cl::NDRange{static_cast(tmp0), - static_cast(tmp1), - static_cast(tmp2)}; + return cl::NDRange{static_cast(gws0), + static_cast(gws1), + static_cast(gws2)}; } } // namespace lite diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 1964c4bf56b55841ba735c79b2f7a17dc1ed451e..41059a0d42a95bbffed4c41611b9f3b8ac60861c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -27,6 +27,21 @@ namespace lite { class CLContext { public: + ~CLContext() { + for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { + // Note(ysh329): Don't need `clReleaseKernel` + kernels_[kidx].reset(); + } + kernels_.clear(); + kernel_offset_.clear(); + for (auto &p : programs_) { + // Note(ysh329): Dont't need `clReleaseProgram` + p.second.reset(); + } + programs_.clear(); + LOG(INFO) << "release cl::Program, cl::Kernel finished."; + } + cl::CommandQueue &GetCommandQueue(); cl::Context &GetContext(); @@ -36,7 +51,8 @@ class CLContext { void AddKernel(const std::string &kernel_name, const std::string &file_name, - const std::string &options = ""); + const std::string &options = "", + const std::string &time_stamp = ""); cl::Kernel &GetKernel(const int index); @@ -46,9 +62,15 @@ class CLContext { cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size); + cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size, + size_t max_work_size, + int divitor = 2); + // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, + // size_t max_work_size); + private: std::unordered_map> programs_; - std::vector> kernels_; + std::vector> kernels_; std::map kernel_offset_; }; diff --git a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl index cb29860dc7556bdaea3c09589a8c6120c5ef2a1a..08491d5d9fd195430a4b03673c38767f7e4a5be8 100644 --- a/lite/backends/opencl/cl_kernel/image/activation_kernel.cl 
+++ b/lite/backends/opencl/cl_kernel/image/activation_kernel.cl @@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input, __write_only image2d_t output, __private const float threshold, - __private const float scale) { + __private const float scale) { + const int x = get_global_id(0); // image_width + const int y = get_global_id(1); // image_height - const int x = get_global_id(0); // image_width - const int y = get_global_id(1); // image_height - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); - CL_DTYPE4 out = 1 / (1 + exp(-in)); + CL_DTYPE4 out; + out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x))); + out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); + out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); + out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); } diff --git a/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..72b0b66f9737ce0ca9c740e6d4e399d06eaf2cd8 --- /dev/null +++ b/lite/backends/opencl/cl_kernel/image/box_coder_kernel.cl @@ -0,0 +1,152 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +__kernel void decode_center_size(__read_only image2d_t prior_box_image, + __read_only image2d_t prior_box_var_image, + __read_only image2d_t target_box_image, + __write_only image2d_t output_image, + __private const int out_C, + __private const int out_H){ + const int out_c = get_global_id(0); + const int out_nh = get_global_id(1); + const int out_h = out_nh % out_H; + const int out_n = 1; + + const int prior_box_n = 1; + const int prior_box_c = 0; + const int prior_box_h = out_h; + + const int prior_box_var_n = 1; + const int prior_box_var_c = 0; + const int prior_box_var_h = out_h; + + const int target_box_n = 1; + const int target_box_c = out_c; + const int target_box_h = out_h; + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + int2 prior_box_pos; + int2 prior_box_var_pos; + int2 target_box_pos; + int2 output_pos; + + prior_box_pos.x = prior_box_c * 4; + prior_box_pos.y = prior_box_n * prior_box_h; + + prior_box_var_pos.x = prior_box_var_c * 4; + prior_box_var_pos.y = prior_box_var_n * prior_box_var_h; + + target_box_pos.x = target_box_c * 4; + target_box_pos.y = target_box_n * target_box_h; + + output_pos.x = out_c * 4; + output_pos.y = out_n * out_h; + + CL_DTYPE4 prior_box_input[4]; + CL_DTYPE4 prior_box_var_input[4]; + CL_DTYPE4 target_box_input[4]; + + prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 0, prior_box_pos.y)); + prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 1, prior_box_pos.y)); + prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 2, prior_box_pos.y)); + prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler, + (int2)(prior_box_pos.x + 3, prior_box_pos.y)); + + prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y)); + prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y)); + prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y)); + prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler, + (int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y)); + + target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 0,target_box_pos.y)); + target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 1, target_box_pos.y)); + target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 2, target_box_pos.y)); + target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler, + (int2)(target_box_pos.x + 3, target_box_pos.y)); + + CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x; + CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x; + CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2; + CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2; + + CL_DTYPE4 target_box_center_x; + CL_DTYPE4 target_box_center_y; + CL_DTYPE4 target_box_width; + CL_DTYPE4 target_box_height; + CL_DTYPE4 output[4]; + + output[0] = 0.0f; + output[1] = 0.0f; + output[2] = 0.0f; + output[3] = 0.0f; + + target_box_center_x.x = 
prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x; + target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y; + target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width; + target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height; + + output[0].x = target_box_center_x.x - target_box_width.x/(half)2; + output[1].x = target_box_center_y.x - target_box_height.x/(half)2; + output[2].x = target_box_center_x.x + target_box_width.x/(half)2; + output[3].x = target_box_center_y.x + target_box_height.x/(half)2; + + if(out_C - out_c * 4 >= 2){ + target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x; + target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y; + target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width; + target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height; + output[0].y = target_box_center_x.y - target_box_width.y/(half)2; + output[1].y = target_box_center_y.y - target_box_height.y/(half)2; + output[2].y = target_box_center_x.y + target_box_width.y/(half)2; + output[3].y = target_box_center_y.y + target_box_height.y/(half)2; + } + if(out_C - out_c * 4 >= 3){ + target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x; + target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y; + target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width; + target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height; + output[0].z = target_box_center_x.z - target_box_width.z/(half)2; + output[1].z = target_box_center_y.z - target_box_height.z/(half)2; + output[2].z = target_box_center_x.z + target_box_width.z/(half)2; + output[3].z = target_box_center_y.z + target_box_height.z/(half)2; + } + if(out_C - out_c * 4 >= 4){ + target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x; + target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y; + target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width; + target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height; + output[0].w = target_box_center_x.w - target_box_width.w/(half)2; + output[1].w = target_box_center_y.w - target_box_height.w/(half)2; + output[2].w = target_box_center_x.w + target_box_width.w/(half)2; + output[3].w = target_box_center_y.w + target_box_height.w/(half)2; + } + + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]); +} diff --git a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl index 8cdec7beabafc2701b6522fcb6492eff76353279..73a089d7591b98486daac2d4aaa29fe4f2192134 100644 --- 
a/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/elementwise_mul_kernel.cl @@ -30,6 +30,143 @@ __kernel void elementwise_mul(__global image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } +__kernel void channel_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// etc : 1 1 1 72 +// run time Y [value,0,0,0] * 72 +__kernel void channel_mul_d2(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias0; + int2 coords_bias1; + int2 coords_bias2; + int2 coords_bias3; + /* if (x == 0 && y == 0) { + CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0}; + #define PPI(j, k) \ + b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \ + printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \ + convert_float(b.y), convert_float(b.z), convert_float(b.w)); + for (int i = 0; i < 73; ++i) { + PPI(i, 0); + } + #undef PPI + }*/ + coords_bias0.x = x / w * 4; + coords_bias0.y = 0; + coords_bias1.x = x / w * 4 + 1; + coords_bias1.y = 0; + coords_bias2.x = x / w * 4 + 2; + coords_bias2.y = 0; + coords_bias3.x = x / w * 4 + 3; + coords_bias3.y = 0; + CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0); + CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1); + CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2); + CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3); + /* if (x == 0 && y == 0) { + printf("bias0={ %f , %f , %f , %f }\n ", + convert_float(biase0.x), convert_float(biase0.y), + convert_float(biase0.z), convert_float(biase0.w)); + printf("bias1={ %f , %f , %f , %f }\n ", + convert_float(biase1.x), convert_float(biase1.y), + convert_float(biase1.z), convert_float(biase1.w)); + printf("bias2={ %f , %f , %f , %f }\n ", + convert_float(biase2.x), convert_float(biase2.y), + convert_float(biase2.z), convert_float(biase2.w)); + printf("bias3={ %f , %f , %f , %f }\n ", + convert_float(biase3.x), convert_float(biase3.y), + convert_float(biase3.z), convert_float(biase3.w)); + }*/ + CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x}; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 output = mad(in, biase, 0); + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +// c 1 1 +__kernel void channel_mul_d3(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage, + int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + 
coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +__kernel void channel_mul_d4(__global image2d_t input, +__global image2d_t bias, + __write_only image2d_t outputImage, int w) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + int2 coords_bias; + coords_bias.x = x / w; + coords_bias.y = 0; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + +#if 0 // TODO(ysh329): comment code below +__kernel void elementwise_mul(__global image2d_t input, + __global image2d_t bias, + __write_only image2d_t outputImage) { + int x = get_global_id(0); + int y = get_global_id(1); + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; + int2 coords; + coords.x = x; + coords.y = y; + CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords); + CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords); + CL_DTYPE4 output = in * biase; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); +} + __kernel void channel_mul_d1(__read_only image2d_t input, __read_only image2d_t bias, @@ -184,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input, WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output); } - +#endif diff --git a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl index b5346e3af4f472d5ed095d586cb68122655cf1c4..3e3d65394f9924edac735084c2fe5ce550f20684 100644 --- a/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/instance_norm_kernel.cl @@ -14,14 +14,127 @@ limitations under the License. 
*/ #include +// onnx/pytorch instancenorm by lijian +__kernel void instance_norm_onnx(__private const int in_width, + __private const int in_height, + __private const int in_c_group, + __private const int local_work_size_x, + __private const int local_work_size_y, + __private const float epsilon, + __read_only image2d_t input, + __write_only image2d_t output) { + const int out_cn = get_global_id(0); + const int n = out_cn / in_c_group; + const int c = out_cn % in_c_group; + const int w = get_local_id(1); + const int h = get_local_id(2); + const int local_id = w * local_work_size_y + h; + const int local_total_size = local_work_size_x * local_work_size_y; -__kernel void instance_norm(__read_only image2d_t input, - __write_only image2d_t output, - __read_only image2d_t scale, - __read_only image2d_t bias, - const float epsilon, - const int in_h, - const int in_w){ + const sampler_t sampler = + CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; +#ifdef LOCAL_MEM_128 + __local float4 shared_mem[128]; +#elif defined(LOCAL_MEM_64) + __local float4 shared_mem[64]; +#else + __local float4 shared_mem[256]; +#endif + int xOffset = c * in_width; + int yOffset = n * in_height; + float4 sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)); + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 mean_val = shared_mem[0]; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val; + sum += temp * temp; + } + } + shared_mem[local_id] = sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id < 32) { + for (int i = local_id + 32; i < local_total_size; i += 32) { + sum += shared_mem[i]; + } + } + shared_mem[local_id] += sum; + + barrier(CLK_LOCAL_MEM_FENCE); + + sum = 0.0f; + if (local_id == 0) { + int top = min(32, local_total_size); + for (int i = 0; i < top; i += 1) { + sum += shared_mem[i]; + } + shared_mem[0] = sum / (in_width * in_height); + } + + barrier(CLK_LOCAL_MEM_FENCE); + + const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon)); + + float4 s = 1 / sigma; + + for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) { + for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) { + int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex); + float4 in_val = read_imagef(input, sampler, intout_pos); + half4 out_val = convert_half4((in_val - mean_val) * s); +#ifdef RELU + out_val = activation(out_val); +#endif + write_imageh(output, intout_pos, out_val); + } + } +} + + +// paddle instancenorm by zhangxi +__kernel void instance_norm_paddle(__read_only image2d_t input, + __write_only image2d_t output, + __read_only image2d_t scale, + __read_only image2d_t bias, + const 
float epsilon, + const int in_h, + const int in_w){ __local CL_DTYPE4 saved_mean[1024]; __local CL_DTYPE4 saved_variance[1024]; const int lid = get_local_id(0); diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 52009718803d7b98ebae481db547713e97b313c7..d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,13 +26,15 @@ CLRuntime* CLRuntime::Global() { CLRuntime::~CLRuntime() { if (command_queue_ != nullptr) { + command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + // For controlling the destruction order command_queue_.reset(); context_.reset(); device_.reset(); platform_.reset(); + device_info_.clear(); } bool CLRuntime::Init() { @@ -128,6 +127,12 @@ bool CLRuntime::InitializePlatform() { } bool CLRuntime::InitializeDevice() { + // ===================== BASIC ===================== + // CL_DEVICE_TYPE_GPU + // CL_DEVICE_NAME + // CL_DEVICE_SUPPORT + // CL_DEVICE_MAX_COMPUTE_UNITS + // CL_DEVICE_MAX_CLOCK_FREQUENCY std::vector all_devices; status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices); CL_CHECK_ERROR(status_); @@ -140,27 +145,153 @@ bool CLRuntime::InitializeDevice() { auto device_name = device_->getInfo(); LOG(INFO) << "Using device: " << device_name; + + cl_device_type device_type = device_->getInfo(); + auto device_type_to_str = [](cl_device_type t) -> std::string { + std::string t_str{""}; + switch (t) { + case CL_DEVICE_TYPE_CPU: + t_str = "CPU"; + break; + case CL_DEVICE_TYPE_GPU: + t_str = "GPU"; + break; + case CL_DEVICE_TYPE_ACCELERATOR: + t_str = "Accelerator"; + break; + case CL_DEVICE_TYPE_DEFAULT: + t_str = "Default"; + break; + default: + t_str = "Unknown"; + } + return t_str; + }; + LOG(INFO) << "device_type:" << device_type_to_str(device_type); + device_info_["CL_DEVICE_TYPE"] = device_type; + + auto max_units = device_->getInfo(); + LOG(INFO) << "The chosen device has " << max_units << " compute units."; + device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units; + + auto max_clock_freq = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq; + device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq; + + // ===================== MEMORY ===================== + // CL_DEVICE_LOCAL_MEM_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHE_SIZE + // CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE + // CL_DEVICE_GLOBAL_MEM_SIZE + auto local_mem_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb + << " KB."; + device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb; + + auto global_mem_cache_size_kb = + static_cast(device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):" + << global_mem_cache_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb; + + auto global_mem_cacheline_size_kb = + static_cast( + device_->getInfo()) / + 1024; + 
LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):" + << global_mem_cacheline_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] = + global_mem_cacheline_size_kb; + + auto global_mem_size_kb = + static_cast(device_->getInfo()) / 1024; + LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB."; + device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb; + + // ===================== WORK_GROUP ===================== + // CL_DEVICE_MAX_WORK_GROUP_SIZE + // CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS + // CL_DEVICE_MAX_WORK_ITEM_SIZES + auto max_work_group_size = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size; + device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size; + + auto max_dims_num = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num; + device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num; + + auto max_work_item_sizes = device_->getInfo(); + for (size_t i = 0; i < max_work_item_sizes.size(); ++i) { + LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i]; + std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i); + device_info_[dim_key] = max_work_item_sizes[i]; + } + + // ===================== BUFFER ===================== + // CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE + auto max_constant_buffer_size_kb = + static_cast( + device_->getInfo()) / + 1024; + LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:" + << max_constant_buffer_size_kb; + device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] = + max_constant_buffer_size_kb; + + // ===================== IMAGE ===================== + // CL_DEVICE_IMAGE_SUPPORT + // CL_DEVICE_IMAGE2D_MAX_HEIGHT + // CL_DEVICE_IMAGE2D_MAX_WIDTH auto image_support = device_->getInfo(); if (image_support) { LOG(INFO) << "The chosen device supports image processing."; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1; } else { LOG(INFO) << "The chosen device doesn't support image processing!"; + device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0; return false; } + + auto image2d_max_height = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height; + device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height; + + auto image2d_max_width = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width; + device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width; + + // ===================== OTHERS / EXTENSION / VERSION ===================== + // CL_DEVICE_EXTENSIONS + // CL_DEVICE_ADDRESS_BITS auto ext_data = device_->getInfo(); VLOG(4) << "The extensions supported by this device: " << ext_data; if (ext_data.find("cl_khr_fp16") != std::string::npos) { LOG(INFO) << "The chosen device supports the half data type."; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1; } else { LOG(INFO) << "The chosen device doesn't support the half data type!"; + device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0; } - auto max_units = device_->getInfo(); - LOG(INFO) << "The chosen device has " << max_units << " compute units."; - auto local_mem = device_->getInfo(); - LOG(INFO) << "The local memory size of the chosen device is " - << static_cast(local_mem) / 1024 << " KB."; + + auto address_bits = device_->getInfo(); + LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits; + device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits; + + auto driver_version = device_->getInfo(); + LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version; + return true; } 
+std::map& CLRuntime::GetDeviceInfo() { + if (0 != device_info_.size()) { + return device_info_; + } + InitializeDevice(); + return device_info_; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 6683a5d92df02ae3a95f2e1b01feb2f303da8558..503b3a011642a8e018781c08647a958c521e6fac 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -1,11 +1,8 @@ /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -55,8 +52,10 @@ class CLRuntime { void set_cl_path(std::string cl_path) { cl_path_ = cl_path; } + std::map& GetDeviceInfo(); + private: - CLRuntime() = default; + CLRuntime() { Init(); } ~CLRuntime(); @@ -84,6 +83,8 @@ class CLRuntime { return queue; } + std::map device_info_; + std::string cl_path_; std::shared_ptr platform_{nullptr}; diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h index b7f14c15e61ba050220ef0819fa9c3d13a7b8606..de01f896a6eb461eb24023a77935bba07de029e7 100644 --- a/lite/backends/opencl/cl_utility.h +++ b/lite/backends/opencl/cl_utility.h @@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } - +#ifndef LITE_SHUTDOWN_LOG #define CL_CHECK_FATAL(err_code__) \ if (err_code__ != CL_SUCCESS) { \ LOG(FATAL) << string_format( \ @@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error); __FILE__, \ __LINE__); \ } +#else +#define CL_CHECK_FATAL(err_code__) +#endif } // namespace lite } // namespace paddle diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe --- /dev/null +++ b/lite/backends/rknpu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs}) diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380 --- /dev/null +++ b/lite/backends/rknpu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
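// Editor's note: the sketch below is an illustration only, not part of this
// patch. It shows roughly how the Device::Build() helper defined in this file
// is expected to be driven by an RKNPU subgraph bridge; the graph setup, the
// element type of the node vectors, and all names are assumptions based on
// the rk::nn types referenced here.
//
//   rk::nn::Graph graph;
//   std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, output_nodes;
//   // ... register operators/tensors on `graph`, fill the node vectors ...
//   std::string model_name = "rknpu_subgraph_0";  // hypothetical name
//   auto exector = paddle::lite::rknpu::Device::Global().Build(
//       model_name, &graph, input_nodes, output_nodes);
//   // The returned rk::nn::Exection is then used to run inference.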
+ +#include "lite/backends/rknpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace rknpu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ) { + VLOG(3) << "[RKNPU] Build model"; + + rk_graph->SetInputsOutputs(input_nodes, output_nodes); + + std::unique_ptr exector = + std::unique_ptr(new rk::nn::Exection(rk_graph)); + + exector->Build(); + + return exector; +} + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15 --- /dev/null +++ b/lite/backends/rknpu/device.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. 
+ std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt index 63b41ae77d0f3949e3d1de13f9db5ca99b4f1c41..38b47ae3120608c7950a1f081e9ec2b133fb955e 100644 --- a/lite/backends/x86/CMakeLists.txt +++ b/lite/backends/x86/CMakeLists.txt @@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) +lite_cc_library(x86_cpu_info SRCS cpu_info.cc) add_subdirectory(jit) add_subdirectory(math) diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index a05a57e93b23008e49683764b5ed669d5c425e5b..2aaa798fa94b7dd47e4dc15d50e663b8fd3c083a 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib"); #elif defined(_WIN32) return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 7d051aa6f5802844753b71fd43400e20b7f5965b..a3376be423828b25c6eda6fff30a56578c7bbbe5 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -28,6 +28,12 @@ #define posix_memalign_free free #endif +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 
0 : errno) +#endif + // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); @@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); +#else PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, "GenBase Alloc %ld error!", size); +#endif PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); return ptr; } diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 8d61fb3bbb97705c697fba934e6cab9424f85bad..9cf3281152840416dc141f98992499c663783b7a 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -96,8 +96,8 @@ class BeamSearchFunctor { // : nullptr; // fill in data - std::vector low_level; - size_t low_offset = 0; + std::vector low_level; + uint64_t low_offset = 0; for (auto &items : selected_items) { low_level.push_back(low_offset); for (auto &item : items) { diff --git a/lite/backends/x86/math/beam_search_test.cc b/lite/backends/x86/math/beam_search_test.cc index 904870207b08d462025ecb4b84d6cf57f7b13f26..233fa03fbaa31165dae4453affb148276f8c6584 100644 --- a/lite/backends/x86/math/beam_search_test.cc +++ b/lite/backends/x86/math/beam_search_test.cc @@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids, paddle::framework::LoDTensor* pre_scores) { // lod paddle::framework::LoD lod; - std::vector level0({0, 2, 4}); - std::vector level1({0, 1, 2, 3, 4}); + std::vector level0({0, 2, 4}); + std::vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); diff --git a/lite/backends/x86/math/blas_impl.h b/lite/backends/x86/math/blas_impl.h index 72d0736268f342187f0be8c6348f5bed75df30ea..34b258892be05625ae88076eff175f56a53d3537 100644 --- a/lite/backends/x86/math/blas_impl.h +++ b/lite/backends/x86/math/blas_impl.h @@ -483,7 +483,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } template <> @@ -759,7 +759,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data()); + mat_out->template mutable_data()); } else { PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ || dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0); @@ -773,7 +773,7 @@ void Blas::MatMul(const lite::Tensor &mat_a, mat_a.data(), mat_b.data(), beta, - mat_out->mutable_data(), + mat_out->template mutable_data(), dim_a.batch_size_ == 0 ? 
dim_b.batch_size_ : dim_a.batch_size_, dim_a.stride_, dim_b.stride_); diff --git a/lite/backends/x86/math/concat_and_split.cc b/lite/backends/x86/math/concat_and_split.cc index bec93dde41fdb654cfbfd20f5d9e59d1d372e3a8..df75654aebaba26b9889d97445bd889cdf2f4eb0 100644 --- a/lite/backends/x86/math/concat_and_split.cc +++ b/lite/backends/x86/math/concat_and_split.cc @@ -51,7 +51,7 @@ class ConcatFunctor { // auto cpu_place = boost::get(context.GetPlace()); // computation - auto output_data = output->mutable_data(); + auto output_data = output->template mutable_data(); int col_idx = 0; for (int j = 0; j < num; ++j) { int col_len = input_cols[j]; @@ -108,7 +108,7 @@ class SplitFunctor { int col_len = output_cols[j]; auto* out_tensor = outputs->at(j); if (out_tensor != nullptr) { - T* dst_ptr = out_tensor->mutable_data() + k * col_len; + T* dst_ptr = out_tensor->template mutable_data() + k * col_len; std::copy_n(src_ptr + col_idx, col_len, dst_ptr); // memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, // sizeof(T) * col_len); diff --git a/lite/backends/x86/math/cross_entropy.cc b/lite/backends/x86/math/cross_entropy.cc index 366486924a8c4a5eefd6341183b4f1bc1c0277ad..941a34643669f060cdd18f38f92c39e529da7b19 100644 --- a/lite/backends/x86/math/cross_entropy.cc +++ b/lite/backends/x86/math/cross_entropy.cc @@ -50,8 +50,8 @@ class CrossEntropyFunctor { .reshape(batch_axis_remain) .sum(Eigen::DSizes(1))); } else { - const T* prob_data = prob->data(); - T* loss_data = out->mutable_data(); + const T* prob_data = prob->template data(); + T* loss_data = out->template mutable_data(); const int64_t* label_data = labels->data(); for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/im2col.cc b/lite/backends/x86/math/im2col.cc index 1c4c6a49f5bb804a57344c59368d18255e8a7912..b916c912ffc2a4d62b63b98fdce150b353ba087e 100644 --- a/lite/backends/x86/math/im2col.cc +++ b/lite/backends/x86/math/im2col.cc @@ -99,7 +99,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int c = 0; c < channels_col; ++c) { @@ -161,7 +161,7 @@ class Im2ColFunctordims()[1]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) { @@ -235,7 +235,7 @@ class Col2ImFunctormutable_data(); + T* im_data = im->template mutable_data(); const T* col_data = col.data(); for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) { diff --git a/lite/backends/x86/math/im2col_cfo_cpu.h b/lite/backends/x86/math/im2col_cfo_cpu.h index 4623f045bb1cbe67605b36621efcc3285b989ad5..97579647d4ec3a9a95e033a153417cb0aaadbeb6 100644 --- a/lite/backends/x86/math/im2col_cfo_cpu.h +++ b/lite/backends/x86/math/im2col_cfo_cpu.h @@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im, int channels_col = im_channels * filter_height * filter_width; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; int h_offset = (c / filter_width) % filter_height; @@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im, int output_width = col->dims()[4]; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int col_matrix_width = output_width * output_height; int 
im_size = im_height * im_width; size_t copy_size = sizeof(T) * output_width; @@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im, constexpr int prw = 1; const T* im_data = im.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); int im_size = im_height * im_width; int col_matrix_width = output_width * output_height; int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow diff --git a/lite/backends/x86/math/math_function.cc b/lite/backends/x86/math/math_function.cc index a17807e8a997f0ecf908313a4cb205676e4fa4b8..05a10b5a19fbc8e80ee6dd07e67154d9cf6d1b22 100644 --- a/lite/backends/x86/math/math_function.cc +++ b/lite/backends/x86/math/math_function.cc @@ -65,7 +65,7 @@ struct TensorSetConstantCPU { : tensor_(tensor), value_(value) {} template void apply() const { - auto* begin = tensor_->mutable_data(lite::TargetType::kX86); + auto* begin = tensor_->template mutable_data(lite::TargetType::kX86); std::fill(begin, begin + tensor_->numel(), static_cast(value_)); } lite::Tensor* tensor_; @@ -126,7 +126,7 @@ struct RowwiseAdd { const T* input_data = input.data(); const T* vector_data = vector.data(); - T* output_data = output->mutable_data(); + T* output_data = output->template mutable_data(); for (int64_t i = 0; i < in_dims[0]; ++i) { for (int64_t j = 0; j < size; ++j) { output_data[i * in_dims[0] + j] = diff --git a/lite/backends/x86/math/math_function_impl.h b/lite/backends/x86/math/math_function_impl.h index 3aaca2e59370f8f2b922554ec6f378bb2a3de9b5..acfb76759f6fc9fa4122afd2388bc3adf8f5ea22 100644 --- a/lite/backends/x86/math/math_function_impl.h +++ b/lite/backends/x86/math/math_function_impl.h @@ -83,7 +83,7 @@ class ColwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), size); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -129,7 +129,7 @@ class RowwiseMean { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -173,7 +173,7 @@ class RowwiseSum { auto size = in_dims[1]; PADDLE_ENFORCE_EQ(out->numel(), height); - T* out_buf = out->mutable_data(out->target()); + T* out_buf = out->template mutable_data(out->target()); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { diff --git a/lite/backends/x86/math/maxouting.cc b/lite/backends/x86/math/maxouting.cc index 20b40fe7c5000cc1d0ee80c18efa5d1defc911f0..f97b16f7fb3326a6d2eb186e2984df3dbd0a0a90 100644 --- a/lite/backends/x86/math/maxouting.cc +++ b/lite/backends/x86/math/maxouting.cc @@ -35,7 +35,7 @@ class MaxOutFunctor { // c_size means the output size of each sample int c_size = fea_size * output_channels; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int new_bindex = c_size * i; @@ -72,7 +72,8 @@ class MaxOutGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template 
mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; ++i) { int blen = fea_size * output_channels * i; diff --git a/lite/backends/x86/math/pooling.cc b/lite/backends/x86/math/pooling.cc index ab6c1edb481f914d5751149aca2595fee550ca51..4393c42157bb7667ec2218e8b76f05a2c60bcc86 100644 --- a/lite/backends/x86/math/pooling.cc +++ b/lite/backends/x86/math/pooling.cc @@ -54,8 +54,8 @@ class Pool2dFunctor { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; - const T* input_data = input->data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + const T* input_data = input->template data(); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -137,7 +137,8 @@ class Pool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int hstart, hend; int wstart, wend; @@ -220,7 +221,8 @@ class MaxPool2dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -322,7 +324,7 @@ class Pool3dFunctor { const int output_stride = output_depth * output_height * output_width; const T* input_data = input.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -425,7 +427,8 @@ class Pool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); int dstart, dend; int hstart, hend; @@ -530,7 +533,8 @@ class MaxPool3dGradFunctor { const T* input_data = input.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/sample_prob.h b/lite/backends/x86/math/sample_prob.h index 5312b3df10a41444c073f0cf61d69bce6fc3859a..4351df68a2630c2b8c6f7285f3955a9b06165f67 100644 --- a/lite/backends/x86/math/sample_prob.h +++ b/lite/backends/x86/math/sample_prob.h @@ -58,11 +58,11 @@ class SampleWithProb { const int64_t* label_data = L->data(); // int64_t* samples_data = // S->mutable_data(ret_dim, Target); - // T* probabilities_data = P->mutable_data(ret_dim, Target); + // T* probabilities_data = P->template mutable_data(ret_dim, Target); S->Resize({batch_size, num_sampled_classes}); auto* samples_data = S->mutable_data(Target); P->Resize({batch_size, num_sampled_classes}); - auto* probabilities_data = P->mutable_data(Target); + auto* probabilities_data = P->template mutable_data(Target); // temp sets for unique sampling std::unordered_set tmp_samples; diff --git 
a/lite/backends/x86/math/search_fc.cc b/lite/backends/x86/math/search_fc.cc index 56fc363cb48ec5c58f4a7ee3e62a2e6bd7355021..014b213d4f10f7161dc1881d582cca93f2be58e5 100644 --- a/lite/backends/x86/math/search_fc.cc +++ b/lite/backends/x86/math/search_fc.cc @@ -42,7 +42,7 @@ class SearchFcFunctor { lite::DDim dims(std::vector({bottom.dims()[0], out_size})); const auto bottom_data = bottom.data(); - auto top_data = top->mutable_data(lite::TargetType::kX86); + auto top_data = top->template mutable_data(lite::TargetType::kX86); const auto weights = w.data(); auto blas = math::GetBlas(context); call_gemm(blas, diff --git a/lite/backends/x86/math/selected_rows_functor.cc b/lite/backends/x86/math/selected_rows_functor.cc index f8f1b42361832771ba04d1bdc8b3e2e05f954e29..acb377e31ccac96547fc4f0644332cfad36d66bc 100644 --- a/lite/backends/x86/math/selected_rows_functor.cc +++ b/lite/backends/x86/math/selected_rows_functor.cc @@ -52,7 +52,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size()); PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size()); - auto* out_data = out_value->mutable_data(); + auto* out_data = out_value->template mutable_data(); auto* in1_data = in1_value.data(); std::copy_n(in1_data, in1_value.numel(), out_data); @@ -87,7 +87,7 @@ struct SelectedRowsAddTensor { functor(context, output, 0.0); auto* in1_data = in1_value.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -127,7 +127,7 @@ struct SelectedRowsAddTo { in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end()); auto* in1_data = in1_value.data(); - auto* in2_data = in2_value->mutable_data(); + auto* in2_data = in2_value->template mutable_data(); std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset); } }; @@ -161,7 +161,7 @@ struct SelectedRowsSumTo { input2->set_rows(in2_rows); auto* in2_value = input2->mutable_value(); - T* in2_data = in2_value->mutable_data(); + T* in2_data = in2_value->template mutable_data(); auto blas = math::GetBlas(context); size_t offset = 0u; for (size_t i = 0u; i != input1.size(); ++i) { @@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->mutable_data(); + auto* input2_data = input2->template mutable_data(); for (size_t i = 0; i < in1_rows.size(); i++) { for (int64_t j = 0; j < in1_row_numel; j++) { @@ -305,7 +305,7 @@ struct MergeAdd { lite::DDim dims(std::vector( {static_cast(merged_row_set.size()), input_width})); out.mutable_value()->Resize(dims); - auto* out_data = out.mutable_value()->mutable_data(); + auto* out_data = out.mutable_value()->template mutable_data(); if (merged_row_set.size() == row_num && !sorted_result) { // no duplicated ids, just concat the result together @@ -385,7 +385,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height); auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); + auto* input2_data = input2->template data(); // FIXME(typhoonzero): use macro fix the below messy code. 
switch (op) { diff --git a/lite/backends/x86/math/sequence2batch.cc b/lite/backends/x86/math/sequence2batch.cc index c12c05414d717dce706590a491ccae2384f3bfe5..aa7aeac532e2fa1f90d452924b364be1896ee862 100644 --- a/lite/backends/x86/math/sequence2batch.cc +++ b/lite/backends/x86/math/sequence2batch.cc @@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor { public: void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index) { - const size_t* index = index_lod.data(); + const uint64_t* index = index_lod.data(); const auto& src_dims = src.dims(); const auto& dst_dims = dst->dims(); PADDLE_ENFORCE_EQ( @@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor { auto height = dst_dims[0]; auto width = dst_dims[1]; auto* src_data = src.data(); - auto* dst_data = dst->mutable_data(); + auto* dst_data = dst->template mutable_data(); const int sz = width * sizeof(T); if (is_src_index) { for (int i = 0; i < height; ++i) { diff --git a/lite/backends/x86/math/sequence2batch.h b/lite/backends/x86/math/sequence2batch.h index a70cc5bf73522f97ab312fc48553b5316dbf8376..63df008b6dfca936265019a71ac0a553c525dc73 100644 --- a/lite/backends/x86/math/sequence2batch.h +++ b/lite/backends/x86/math/sequence2batch.h @@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor { // The indexed rows are based on the input index. void operator()(const lite::Context& context, const lite::Tensor& src, - const std::vector& index_lod, + const std::vector& index_lod, lite::Tensor* dst, bool is_src_index); }; @@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor { // batch_lods[2] is the sort order for the input LoDTensor. batch_lods->at(2).resize(seq_info.size()); - size_t* batch_starts = batch_lods->at(0).data(); - size_t* seq2batch_idx = batch_lods->at(1).data(); + auto* batch_starts = batch_lods->at(0).data(); + auto* seq2batch_idx = batch_lods->at(1).data(); batch_starts[0] = 0; for (int n = 0; n < max_seqlen; n++) { auto batch_id = static_cast(batch_starts[n]); @@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor { } batch_starts[n + 1] = static_cast(batch_id); } - size_t* seq_order = batch_lods->at(2).data(); + auto* seq_order = batch_lods->at(2).data(); for (size_t i = 0; i < seq_info.size(); ++i) { seq_order[i] = seq_info[i].seq_idx; } diff --git a/lite/backends/x86/math/sequence_padding.cc b/lite/backends/x86/math/sequence_padding.cc index fbb6c11a5f7a0cbae36d2f8fba0b141dadadf542..eb977dc2d23f4cfaeec7dd5a6e2834ca23345f76 100644 --- a/lite/backends/x86/math/sequence_padding.cc +++ b/lite/backends/x86/math/sequence_padding.cc @@ -22,15 +22,15 @@ namespace math { template void CopyValidData(lite::Tensor* dst_tensor, const lite::Tensor* src_tensor, - const std::vector& seq_offsets, + const std::vector& seq_offsets, int pad_seq_len, int step_width, bool norm_by_len, CopyType type, PadLayout layout) { int seq_num = seq_offsets.size() - 1; - const T* src_data = src_tensor->data(); - T* dst_data = dst_tensor->mutable_data(); + const T* src_data = src_tensor->template data(); + T* dst_data = dst_tensor->template mutable_data(); int seq_cpy_gap = step_width; int pad_cpy_gap = @@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor { "'step_width'."); // fill padding value - T* pad_data = pad_tensor->mutable_data(); + T* pad_data = pad_tensor->template mutable_data(); const T* pad_value_data = pad_value.data(); if (pad_value.numel() == 1) { fast_mem_init( diff --git a/lite/backends/x86/math/sequence_padding.h b/lite/backends/x86/math/sequence_padding.h index 
a3f4512042de4c7a2fc665f2fd41777d472225f5..43407014dea0ed0c78ab29da7fb8ebb0e0310566 100644 --- a/lite/backends/x86/math/sequence_padding.h +++ b/lite/backends/x86/math/sequence_padding.h @@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth }; enum CopyType { kSeqToPad, kPadToSeq }; -inline static size_t MaximumSequenceLength( - const std::vector& seq_offset) { - size_t seq_num = seq_offset.size() - 1; - size_t max_seq_len = 0; +inline static uint64_t MaximumSequenceLength( + const std::vector& seq_offset) { + uint64_t seq_num = seq_offset.size() - 1; + uint64_t max_seq_len = 0; for (size_t i = 0; i < seq_num; ++i) { max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]); } @@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength( inline static void CheckDims(const lite::DDim& seq_tensor_dims, const lite::DDim& pad_tensor_dims, - const std::vector& seq_offset, + const std::vector& seq_offset, int64_t padded_seq_len, int64_t step_width, const PadLayout& layout) { diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 186b8b5543c7132867093616c83b45ae8ff27d3c..34c55c5714e467954bc1bb79d9b1385ef5cfe497 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -55,7 +55,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int* max_index = index->mutable_data(); int64_t num_seq = out_dims[0]; @@ -103,7 +103,7 @@ class MaxSeqPoolFunctor { auto starts = input.lod()[0]; const T* in_data = input.data(); - T* out_data = output->mutable_data(); + T* out_data = output->template mutable_data(); int64_t num_seq = out_dims[0]; int64_t dim = output->numel() / num_seq; @@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor { const T* og_data = out_grad.data(); const int* max_index = index.data(); - T* ig_data = in_grad->mutable_data(); + T* ig_data = in_grad->template mutable_data(); SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); @@ -170,7 +170,7 @@ class LastSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -203,7 +203,7 @@ class FirstSeqPoolFunctor { lite::Tensor* output) { // Create pointers to input and output data auto* in_data = input.data(); - auto* out_data = output->mutable_data(); + auto* out_data = output->template mutable_data(); // Calculate the size of each item in sequence int64_t item_size = input.numel() / input.dims()[0]; @@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor { int64_t in_w = in_grad->numel() / in_grad->dims()[0]; PADDLE_ENFORCE(in_w == out_w); const T* out_g_data = out_grad.data(); - T* in_g_data = in_grad->mutable_data(TARGET(kX86)); + T* in_g_data = in_grad->template mutable_data(TARGET(kX86)); auto blas = math::GetBlas(context); for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { int64_t h = static_cast(lod[i + 1] - lod[i]); @@ -288,7 +288,7 @@ class SequencePoolFunctor { auto lod = input.lod()[0]; if (pooltype == "SUM") { const T* src = input.data(); - T* dst = output->mutable_data(TARGET(kX86)); + T* dst = output->template mutable_data(TARGET(kX86)); jit::seq_pool_attr_t attr( static_cast(input.numel() / input.dims()[0]), 
jit::SeqPoolType::kSum); diff --git a/lite/backends/x86/math/sequence_pooling_test.cc b/lite/backends/x86/math/sequence_pooling_test.cc index a73014767345842f09ac2ff0cd5c2e7231c1f90a..b91f43a571994bef95650361a6dc62c0465837a7 100644 --- a/lite/backends/x86/math/sequence_pooling_test.cc +++ b/lite/backends/x86/math/sequence_pooling_test.cc @@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) { TEST(SequencePoolingGrad, CPU_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); @@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) { #ifdef PADDLE_WITH_CUDA TEST(SequencePoolingGrad, CUDA_SUM) { paddle::framework::LoD lod1; - lod1.push_back(std::vector{0, 10}); + lod1.push_back(std::vector{0, 10}); TestSequencePoolingSum(lod1); paddle::framework::LoD lod2; - lod2.push_back(std::vector{0, 2, 7, 10}); + lod2.push_back(std::vector{0, 2, 7, 10}); TestSequencePoolingSum(lod2); diff --git a/lite/backends/x86/math/sequence_scale.cc b/lite/backends/x86/math/sequence_scale.cc index fad0628de15379b58847827cc3d48bf6085cbda2..25c7be0d0e2747f4f28c1d82f8855872d57726d1 100644 --- a/lite/backends/x86/math/sequence_scale.cc +++ b/lite/backends/x86/math/sequence_scale.cc @@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor { size_t seq_width = seq->dims()[1]; lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod); - T* seq_data = seq->mutable_data(lite::TargetType::kX86); + T* seq_data = seq->template mutable_data(lite::TargetType::kX86); for (size_t i = 0; i < num_seq; ++i) { for (size_t j = lod[level][i] * seq_width; j < lod[level][i + 1] * seq_width; diff --git a/lite/backends/x86/math/sequence_topk_avg_pooling.cc b/lite/backends/x86/math/sequence_topk_avg_pooling.cc index 035a7923c70f91cf27f1d845f68110f8f33cb73d..97e27fed59f4bc1a4c457ea9cf515da6caca9a1c 100644 --- a/lite/backends/x86/math/sequence_topk_avg_pooling.cc +++ b/lite/backends/x86/math/sequence_topk_avg_pooling.cc @@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor { auto pos_data = pos->mutable_data(lite::TargetType::kX86); int offset = 0; - std::vector vec_out_lod; + std::vector vec_out_lod; vec_out_lod.reserve(batch_size + 1); for (int i = 0; i <= batch_size; ++i) { offset = row_lod[i]; @@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor { out->set_lod(lod_temp); auto in_data = in.data(); - auto out_data = out->mutable_data(lite::TargetType::kX86); + auto out_data = out->template mutable_data(lite::TargetType::kX86); T* sum_data = new T[max_k]; for (int i = 0; i < batch_size; ++i) { diff --git a/lite/backends/x86/math/softmax_impl.h b/lite/backends/x86/math/softmax_impl.h index ec45377bc55154a4a36ebc5c3684ab7efeeef88e..1ba84dda42093155b10fa74a49e953d6663b8c88 100644 --- a/lite/backends/x86/math/softmax_impl.h +++ b/lite/backends/x86/math/softmax_impl.h @@ -108,8 +108,8 @@ class SoftmaxFunctor> { const int num_remain = num_classes / axis_dim; if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* in_data = X->data(); - auto* out_data = Y->mutable_data(); + const T* in_data = X->template data(); + auto* out_data = Y->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T max_val = *std::max_element(in_data, in_data + num_classes); max_val *= static_cast(-1); @@ -219,9 +219,9 @@ class SoftmaxGradFunctor> { const int num_remain = num_classes / axis_dim; if 
(num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) { - const T* out_data = y->data(); - const T* out_grad = y_grad->data(); - T* in_grad = x_grad->mutable_data(); + const T* out_data = y->template data(); + const T* out_grad = y_grad->template data(); + T* in_grad = x_grad->template mutable_data(); for (int bs = 0; bs < batch_size; ++bs) { T scalar; vec_mul_reduce( diff --git a/lite/backends/x86/math/tree2col.cc b/lite/backends/x86/math/tree2col.cc index 20b913331308c8b8c95d190b6b0b3d76ccac354b..bfc7084c9ff018101ca3dfc1d1748083b1449662 100644 --- a/lite/backends/x86/math/tree2col.cc +++ b/lite/backends/x86/math/tree2col.cc @@ -104,12 +104,12 @@ class Tree2ColFunctor { patch_size = processing_list.size(); // T *patch_data = - // patch->mutable_data({static_cast(patch_size), + // patch->template mutable_data({static_cast(patch_size), // static_cast(patch_elem_size)}, // cpu_place); patch->Resize({static_cast(patch_size), static_cast(patch_elem_size)}); - auto *patch_data = patch->mutable_data(lite::TargetType::kX86); + auto *patch_data = patch->template mutable_data(lite::TargetType::kX86); constant(context, patch, 0); const T *features = node_features.data(); @@ -166,12 +166,12 @@ class Col2TreeFunctor { } } // T *grad_data = - // in_grad->mutable_data({static_cast(node_count), + // in_grad->template mutable_data({static_cast(node_count), // static_cast(grad_elem_size)}, // cpu_place); in_grad->Resize({static_cast(node_count), static_cast(grad_elem_size)}); - auto *grad_data = in_grad->mutable_data(lite::TargetType::kX86); + auto *grad_data = in_grad->template mutable_data(lite::TargetType::kX86); constant(context, in_grad, 0); const T *out_g = out_grad.data(); diff --git a/lite/backends/x86/math/unpooling.cc b/lite/backends/x86/math/unpooling.cc index 568f9952cab755c8441695e1a9266a2001d2b9a9..119d7294e9ec21e67f09776ad20d04f15b8b81ce 100644 --- a/lite/backends/x86/math/unpooling.cc +++ b/lite/backends/x86/math/unpooling.cc @@ -36,7 +36,7 @@ class Unpool2dMaxFunctor { int output_feasize = output_height * output_width; const T* input_data = input.data(); const int* indices_data = indices.data(); - T* output_data = output->mutable_data(lite::TargetType::kX86); + T* output_data = output->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor { int output_feasize = output_height * output_width; const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(lite::TargetType::kX86); + T* input_grad_data = + input_grad->template mutable_data(lite::TargetType::kX86); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { diff --git a/lite/backends/x86/math/vol2col.cc b/lite/backends/x86/math/vol2col.cc index 8fd5e8954e2010d5226d56ac4a87a44e6364c8c6..91979bb7fdcfe66d84ded3f9797144ddafc8769e 100644 --- a/lite/backends/x86/math/vol2col.cc +++ b/lite/backends/x86/math/vol2col.cc @@ -75,7 +75,7 @@ class Vol2ColFunctor { "mismatching."); const T* vol_data = vol.data(); - T* col_data = col->mutable_data(); + T* col_data = col->template mutable_data(); for (int c = 0; c < channels_col; ++c) { int w_offset = c % filter_width; @@ -159,7 +159,7 @@ class Col2VolFunctor { output_width, "input_width and output_width are " "mismatching."); - T* vol_data = vol->mutable_data(); + T* vol_data = vol->template mutable_data(); const T* col_data = 
col.data(); for (int c = 0; c < channels_col; ++c) { diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h index 0689ec4c234509cee6f10f8e0f7dd432edae5c4e..49794b8e15a8f90a6512798baa842534df879f6b 100644 --- a/lite/backends/x86/parallel.h +++ b/lite/backends/x86/parallel.h @@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { // Do not support nested omp parallem. num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); #endif - return std::max(num_threads, 1L); + return std::max(num_threads, 1L); } using ThreadHandler = diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h index c1b81159aca979efe4b46777a1cef49e44b95e27..0e1e2b77b796eae201c55edcd3caecc263e4271e 100644 --- a/lite/backends/x86/port.h +++ b/lite/backends/x86/port.h @@ -14,10 +14,10 @@ #pragma once +#include #include #include -#include #include #include @@ -37,7 +37,9 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#define NOMINMAX // msvc max/min macro conflict with std::min/max #include +#include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) @@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +extern struct timeval; static int gettimeofday(struct timeval *tp, void *tzp) { time_t clock; struct tm tm; diff --git a/lite/backends/xpu/CMakeLists.txt b/lite/backends/xpu/CMakeLists.txt index 4491fdeaefe9f16265bdee2c07ebb02b86a2b038..85bef0452c41ce35c90d9bd058bb7fdefd030f3a 100644 --- a/lite/backends/xpu/CMakeLists.txt +++ b/lite/backends/xpu/CMakeLists.txt @@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU) return() endif() -lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +if(LITE_WITH_XTCL) + lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) +endif() +lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs}) diff --git a/lite/backends/xpu/device.h b/lite/backends/xpu/device.h index 6de18d5466da6e6b791363d2e275ea72376c78b8..a2cc3206d3d0391d89690026561f47983e9376c9 100644 --- a/lite/backends/xpu/device.h +++ b/lite/backends/xpu/device.h @@ -14,12 +14,12 @@ #pragma once -#include #include #include #include #include #include +#include "lite/backends/xpu/xpu_header_sitter.h" namespace paddle { namespace lite { diff --git a/lite/backends/xpu/math.h b/lite/backends/xpu/math.h new file mode 100644 index 0000000000000000000000000000000000000000..48352736d45a20d9abd496d9dd10b000d3f15a28 --- /dev/null +++ b/lite/backends/xpu/math.h @@ -0,0 +1,219 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include +#include "lite/core/tensor.h" + +namespace paddle { +namespace lite { +namespace xpu { +namespace math { + +static inline long round_half_to_even(const float src) { // NOLINT + long ret = llround(src); // NOLINT + if (fabs(fabs(round(src) - src) - 0.5) > 0) { + return ret; + } else { + if (abs(ret) % 2 == 0) { + return ret; + } else { + return ret + (ret > 0 ? -1 : 1); + } + } +} + +static float ieee_compliance_0(float f) { + uint32_t *ptr = reinterpret_cast(&f); + uint32_t sign = (*ptr) & 0x80000000; + uint32_t uf = 0; + // nan -> inf + if (std::isnan(f)) { + uf = (sign | 0x7F800000); + float *ptr = reinterpret_cast(&uf); + return *ptr; + } else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) { + return f; + } else { + // denormal -> +-0 + uf = 0x0; + float *ptr = reinterpret_cast(&uf); + return *ptr; + } +} + +template +static inline T fp32_to_intx(const float f, float max) { + max = ieee_compliance_0(max); + float input = ieee_compliance_0(f); + // +0 and -0 -> +0 + if (input == 0) { + input = 0.0f; + } + + float tmp = RMAX / max; + if (std::isinf(tmp)) { + uint32_t *ptr = reinterpret_cast(&input); + if ((*ptr) >> 31 & 1) { + return T(-RMAX); + } else { + return T(RMAX); + } + } + + tmp = input * tmp; + if (std::isnan(tmp)) { + return T(RMAX); + } + + tmp = ieee_compliance_0(tmp); + // early check to avoid INF or big value get into convertor func. + if (tmp > RMAX) { + return T(RMAX); + } + if (tmp < -RMAX) { + return T(-RMAX); + } + T ret = (T)round_half_to_even(tmp); + if (ret > RMAX) { + ret = T(RMAX); + } + if (ret < -RMAX) { + ret = T(-RMAX); + } + return ret; +} + +static inline int16_t fp32_to_int16(const float f, float max) { + int16_t v1 = fp32_to_intx(f, max); + return v1; +} + +static inline int ConvertFP32ToInt16(const void *input, + void *output, + float max_val, + int len) { + for (int i = 0; i < len; i++) { + static_cast(output)[i] = + fp32_to_int16(static_cast(input)[i], max_val); + } + return 0; +} + +static inline float FindMaxAbs(const float *data, int len) { + float max_f = 0.0f; + for (int i = 0; i < len; ++i) { + float max = std::abs(data[i]); + if (max > max_f) { + max_f = max; + } + } + return max_f; +} + +template +static inline void Transpose(const T *in, T *out, int h, int w) { + for (int h1 = 0; h1 < w; ++h1) { + for (int w1 = 0; w1 < h; ++w1) { + out[h1 * h + w1] = in[w1 * w + h1]; + } + } +} + +/** + * Get row matrix shape from a vector shape. If the rank of x_dim > 1, the + * original x_dim is returned. + */ +static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) { + if (x_dim.size() > 1) { + return x_dim; + } + return lite::DDim({1, x_dim[0]}); +} + +/** + * Get column matrix shape from a vector shape. If the rank of y_dim > 1, the + * original y_dim is returned. + */ +static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) { + if (y_dim.size() > 1) { + return y_dim; + } + return lite::DDim({y_dim[0], 1}); +} + +/** + * Matrix Descriptor of a memory buffer. + * + * It is used for Blas::MatMul. MatMul operator can be batched. + * if Mat A is [BatchSize, H, W], Mat B is [BatchSize, H, W]. It will be a + * `batch_size` times of GEMM. The batched GEMM could be faster base on the + * implementation of the blas library. The batch size could be zero. If any + * matrix of `matmul` has a batch size, the will be a batched GEMM, too. 
e.g., + * Mat A is [BatchSize, H1, W2], and Mat B [H2, W2], The result matrix wil be + * [BatchSize, H1, W2] + * + * The boolean flag, `trans`, describe the memory is the transpose of matrix or + * not. If the trans is true, the last two dims of matrix are transposed. The + * memory layout of the matrix is [Width, Height] or [BatchSize, Width, Height]. + * + * The MatDescriptor is not only the dimension or shape of a matrix, it also + * contains the layout, stride of matrix. It is clearer to have a structure than + * reuse `DDim`. + */ +struct MatDescriptor { + int64_t height_; + int64_t width_; + int64_t stride_{0}; + int64_t batch_size_{0}; + bool trans_; +}; + +static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, + int num_flatten_cols, + bool trans) { + MatDescriptor retv; + if (num_flatten_cols > 1) { + auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); + retv.height_ = flatten_dim[0]; + retv.width_ = flatten_dim[1]; + } else { + if (tensor_dim.size() == 2) { + retv.height_ = tensor_dim[0]; + retv.width_ = tensor_dim[1]; + } else { + auto dim_vec = tensor_dim.Vectorize(); + retv.batch_size_ = 1; + for (size_t i = 0; i < dim_vec.size() - 2; ++i) { + retv.batch_size_ *= dim_vec[i]; + } + retv.height_ = dim_vec[dim_vec.size() - 2]; + retv.width_ = dim_vec[dim_vec.size() - 1]; + retv.stride_ = retv.height_ * retv.width_; + } + } + if (trans) { + std::swap(retv.width_, retv.height_); + } + retv.trans_ = trans; + return retv; +} + +} // namespace math +} // namespace xpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.cc b/lite/backends/xpu/target_wrapper.cc new file mode 100644 index 0000000000000000000000000000000000000000..5dcbc1e275cca8c32003cbef74dfb1f6d4caee93 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/xpu/target_wrapper.h" +#include "lite/backends/xpu/xpu_header_sitter.h" + +namespace paddle { +namespace lite { + +void* TargetWrapperXPU::Malloc(size_t size) { + void* ptr{nullptr}; + xpu_malloc(&ptr, size); + return ptr; +} + +void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); } + +void TargetWrapperXPU::MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir) { + switch (dir) { + case IoDirection::HtoD: + xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE); + break; + case IoDirection::DtoH: + xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST); + break; + default: + LOG(FATAL) << "Unsupported IoDirection " << static_cast(dir); + } +} + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/target_wrapper.h b/lite/backends/xpu/target_wrapper.h new file mode 100644 index 0000000000000000000000000000000000000000..c42d4139246085d8b9a367b45b60699209d0b668 --- /dev/null +++ b/lite/backends/xpu/target_wrapper.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
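Editorial note (illustrative, not part of the patch): the new lite/backends/xpu/math.h above adds symmetric max-abs quantization helpers (FindMaxAbs plus ConvertFP32ToInt16 built on fp32_to_intx, which rounds half to even and normalizes IEEE special cases), a Transpose helper, and a MatDescriptor for batched MatMul; the XPU fuse passes later in this diff use them to pre-quantize FC weights. A minimal standalone sketch of the quantization idea follows; it uses plain round-and-clamp instead of the real rounding rules, and the names are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Largest absolute value in the buffer; used as the symmetric quantization scale.
static float FindMaxAbs(const float* data, int len) {
  float max_f = 0.0f;
  for (int i = 0; i < len; ++i) max_f = std::max(max_f, std::abs(data[i]));
  return max_f;
}

// Map each float to int16 as round(x * 32767 / max_val), clamped to the int16 range.
// (The real helper rounds half to even and handles NaN/Inf/denormals; omitted here.)
static void QuantizeToInt16(const float* in, int16_t* out, float max_val, int len) {
  const float scale = (max_val > 0.0f) ? 32767.0f / max_val : 0.0f;
  for (int i = 0; i < len; ++i) {
    float v = std::round(in[i] * scale);
    v = std::min(32767.0f, std::max(-32767.0f, v));
    out[i] = static_cast<int16_t>(v);
  }
}

int main() {
  std::vector<float> w = {0.5f, -1.25f, 2.0f, -0.125f};
  std::vector<int16_t> q(w.size());
  const int len = static_cast<int>(w.size());
  const float max_f = FindMaxAbs(w.data(), len);
  QuantizeToInt16(w.data(), q.data(), max_f, len);
  for (int i = 0; i < len; ++i) std::printf("%+.3f -> %d\n", w[i], q[i]);
  return 0;
}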
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "lite/core/target_wrapper.h" + +namespace paddle { +namespace lite { + +using TargetWrapperXPU = TargetWrapper; + +template <> +class TargetWrapper { + public: + static size_t num_devices() { return 1; } + static size_t maximum_stream() { return 0; } + + static void* Malloc(size_t size); + static void Free(void* ptr); + + static void MemcpySync(void* dst, + const void* src, + size_t size, + IoDirection dir); +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/xpu/xpu_header_sitter.h b/lite/backends/xpu/xpu_header_sitter.h new file mode 100644 index 0000000000000000000000000000000000000000..875e67d57d4ba2110bfbffb7ee9d1d6a876060fa --- /dev/null +++ b/lite/backends/xpu/xpu_header_sitter.h @@ -0,0 +1,32 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
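Editorial note (illustrative, not part of the patch): lite/backends/xpu/target_wrapper.{h,cc} above follow Lite's usual pattern of specializing TargetWrapper per target with a static-only API (Malloc/Free/MemcpySync wrapping xpu_malloc/xpu_free/xpu_memcpy). A host-only sketch of that specialization pattern, so it runs without the XPU runtime; the actual XPU calls appear only in comments.

#include <cstdio>
#include <cstdlib>

// Stand-in for lite's TargetType; the real code specializes TargetWrapper per target.
enum class TargetType { kHost, kXPU };

template <TargetType Target>
class TargetWrapper {};  // primary template left empty; each backend specializes it

// Host specialization, mirroring the static-only interface the XPU patch adds.
template <>
class TargetWrapper<TargetType::kHost> {
 public:
  static void* Malloc(size_t size) { return std::malloc(size); }
  static void Free(void* ptr) { std::free(ptr); }
};

using TargetWrapperHost = TargetWrapper<TargetType::kHost>;

int main() {
  void* p = TargetWrapperHost::Malloc(64);  // the XPU version calls xpu_malloc here
  std::printf("allocated %p\n", p);
  TargetWrapperHost::Free(p);               // and xpu_free here
  return 0;
}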
+ +#pragma once + +#pragma GCC system_header +#include +#include +#include + +#if defined(LITE_WITH_XTCL) +#include +#endif + +namespace paddle { +namespace lite { + +namespace xdnn = baidu::xpu::api; + +} // namespace lite +} // namespace paddle diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 35aad501070282b49cdd8df72185ad9d21dab9fe..6bd353a9e13bdfbd1fce0291e04f4b5925b18ac1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -5,6 +5,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc DEPS target_wrapper_host place X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda + XPU_DEPS target_wrapper_xpu CL_DEPS cl_target_wrapper FPGA_DEPS fpga_target_wrapper BM_DEPS target_wrapper_bm @@ -37,7 +38,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index afc104073684ff00395fb32335630705ff3f7bc8..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/context.cc b/lite/core/context.cc index be886168e02e21d192305d701110ce5075ffba63..be41aa6eb0cb986760f38eaa2bb5b7e017cc4edb 100644 --- a/lite/core/context.cc +++ b/lite/core/context.cc @@ -15,5 +15,11 @@ #include "lite/core/context.h" namespace paddle { -namespace lite {} // namespace lite +namespace lite { + +#ifdef LITE_WITH_XPU +thread_local xdnn::Context* Context::_tls_raw_ctx{nullptr}; +#endif + +} // namespace lite } // namespace paddle diff --git a/lite/core/context.h b/lite/core/context.h index 6b826fe46f973d9812d76802a48b6d63f16b5081..7ab45bae1d3b3ff518ffa7a1db61cd1f56c92728 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -16,8 +16,7 @@ #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL #include @@ -29,6 +28,9 @@ #include #include "lite/backends/mlu/mlu_utils.h" #endif +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/xpu_header_sitter.h" +#endif #include #include @@ -50,7 +52,6 @@ class Context; using HostContext = Context; using X86Context = Context; -using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; using XPUContext = Context; @@ -58,6 +59,7 @@ using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using 
MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -102,17 +104,59 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { public: Context() {} explicit Context(const XPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler void InitOnce() {} + void CopySharedTo(XPUContext* ctx) {} + static xdnn::Context* GetRawContext() { + if (_tls_raw_ctx == nullptr) { + _tls_raw_ctx = xdnn::create_context(); + CHECK(_tls_raw_ctx); + } + return _tls_raw_ctx; + } + + static void SetWorkspaceL3Size(int l3_size = 0xfffc00) { + xdnn::set_workspace_l3_size(GetRawContext(), l3_size); + } + + static void SetDev(int dev_no = 0) { + const char* dev_env = getenv("LITE_XPU_DEV"); + if (dev_env) { + xpu_set_device(atoi(dev_env)); + return; + } + + xpu_set_device(dev_no); + } + std::string name() const { return "XPUContext"; } + + private: + static thread_local xdnn::Context* _tls_raw_ctx; }; #endif @@ -227,12 +271,10 @@ class Context { void SetIoQueue(cnrtQueue_t queue) { io_queue_ = queue; } cnmlCoreVersion_t MLUCoreVersion() { - return paddle::lite::TargetWrapperMlu::MLUCoreVersion(); + return DeviceInfo::Global().MLUCoreVersion(); } - int MLUCoreNumber() { - return paddle::lite::TargetWrapperMlu::MLUCoreNumber(); - } + int MLUCoreNumber() { return DeviceInfo::Global().MLUCoreNumber(); } u32_t affinity() { return affinity_; } @@ -258,99 +300,6 @@ class Context { }; #endif // LITE_WITH_MLU -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. 
-template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - cublas_fp32_ = std::make_shared>(); - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0UL) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= static_cast(devs.size())) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t& exec_stream() const { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t& io_stream() const { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - CUDAContext& operator=(const CUDAContext& context) { - this->Init( - context.device_id_, context.exec_stream_id_, context.io_stream_id_); - cublas_fp32_ = const_cast(context).cublas_fp32(); - return *this; - } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
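Editorial note (illustrative, not part of the patch): the comment on the removed cuBLAS member above and the new thread_local xdnn::Context in XPUContext::GetRawContext (earlier in this context.h hunk) reflect the same constraint: raw device handles are not thread-safe, so each thread lazily creates and then reuses its own. A standalone sketch of that idiom, using a stand-in RawCtx type instead of xdnn::Context.

#include <cassert>
#include <thread>

struct RawCtx { int id; };                    // stand-in for xdnn::Context
static RawCtx* CreateCtx() { return new RawCtx{0}; }

class XpuCtxHolder {
 public:
  // Create the per-thread context on first use and reuse it afterwards,
  // mirroring XPUContext::GetRawContext() in the patch.
  static RawCtx* Get() {
    if (tls_raw_ctx_ == nullptr) {
      tls_raw_ctx_ = CreateCtx();
    }
    return tls_raw_ctx_;
  }

 private:
  static thread_local RawCtx* tls_raw_ctx_;
};

thread_local RawCtx* XpuCtxHolder::tls_raw_ctx_{nullptr};

int main() {
  RawCtx* a = XpuCtxHolder::Get();
  RawCtx* b = XpuCtxHolder::Get();
  assert(a == b);                             // the same thread reuses one context
  std::thread t([] { assert(XpuCtxHolder::Get() != nullptr); });  // other threads get their own
  t.join();
  return 0;
}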
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - #ifdef LITE_WITH_X86 template <> class Context { @@ -423,7 +372,9 @@ class ContextScheduler { return *x; } - std::unique_ptr NewContext(TargetType target) { + std::unique_ptr NewContext( + TargetType target, + /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): @@ -440,7 +391,7 @@ class ContextScheduler { case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; @@ -457,6 +408,12 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -526,6 +483,9 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 6d856b91888e652568fdae0452345e4dadaa069c..29ac96ed744b016833a746b35002dd68109efd8b 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -66,6 +66,15 @@ thread_local std::vector DeviceInfo::active_ids_; thread_local TensorLite DeviceInfo::workspace_; thread_local int64_t DeviceInfo::count_ = 0; +#ifdef LITE_WITH_MLU +thread_local cnmlCoreVersion_t DeviceInfo::mlu_core_version_{CNML_MLU270}; +thread_local int DeviceInfo::mlu_core_number_{1}; +thread_local bool DeviceInfo::use_first_conv_{false}; +thread_local std::vector DeviceInfo::mean_vec_; +thread_local std::vector DeviceInfo::std_vec_; +thread_local DataLayoutType DeviceInfo::input_layout_{DATALAYOUT(kNCHW)}; +#endif + #ifdef TARGET_IOS const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; @@ -1080,6 +1089,45 @@ int DeviceInfo::Setup() { return 0; } +#ifdef LITE_WITH_MLU +void DeviceInfo::SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout) { + switch (core_version) { + case (lite_api::MLUCoreVersion::MLU_220): + mlu_core_version_ = CNML_MLU220; + break; + case (lite_api::MLUCoreVersion::MLU_270): + mlu_core_version_ = CNML_MLU270; + break; + default: + mlu_core_version_ = CNML_MLU270; + break; + } + mlu_core_number_ = core_number; + use_first_conv_ = use_first_conv; + mean_vec_ = mean_vec; + std_vec_ = std_vec; + input_layout_ = input_layout; +} + +cnmlCoreVersion_t DeviceInfo::MLUCoreVersion() { return mlu_core_version_; } + +int DeviceInfo::MLUCoreNumber() { return mlu_core_number_; } + +bool DeviceInfo::UseFirstConv() { return use_first_conv_; } + +const std::vector& DeviceInfo::MeanVec() const { return mean_vec_; } + +const std::vector& DeviceInfo::StdVec() const { return std_vec_; } + +DataLayoutType DeviceInfo::InputLayout() const { return input_layout_; } + +#endif // LITE_WITH_MLU + void DeviceInfo::SetRunMode(lite_api::PowerMode mode, int thread_num) { #ifdef ARM_WITH_OMP thread_num = std::min(thread_num, core_num_); diff --git a/lite/core/device_info.h b/lite/core/device_info.h index 
5f5c4259e9614e74e04f61983031abf32a5a1621..b06eb8d944735971133bb7a29aa0f06075e60626 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -55,6 +55,20 @@ class DeviceInfo { int Setup(); void SetRunMode(lite_api::PowerMode mode, int thread_num); +#ifdef LITE_WITH_MLU + void SetMLURunMode(lite_api::MLUCoreVersion core_version, + int core_number, + bool use_first_conv, + const std::vector& mean_vec, + const std::vector& std_vec, + DataLayoutType input_layout); + cnmlCoreVersion_t MLUCoreVersion(); + int MLUCoreNumber(); + bool UseFirstConv(); + const std::vector& MeanVec() const; + const std::vector& StdVec() const; + DataLayoutType InputLayout() const; +#endif void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } @@ -106,6 +120,15 @@ class DeviceInfo { static thread_local TensorLite workspace_; static thread_local int64_t count_; +#ifdef LITE_WITH_MLU + static thread_local cnmlCoreVersion_t mlu_core_version_; + static thread_local int mlu_core_number_; + static thread_local bool use_first_conv_; + static thread_local std::vector mean_vec_; + static thread_local std::vector std_vec_; + static thread_local DataLayoutType input_layout_; +#endif + void SetDotInfo(int argc, ...); void SetFP16Info(int argc, ...); void SetFP32Info(int argc, ...); @@ -136,7 +159,7 @@ class Env { static Devs* devs = new Devs(); return *devs; } - static void Init(int max_stream = 4) { + static void Init(int max_stream = 6) { #ifdef LITE_WITH_MLU CNRT_CALL(cnrtInit(0)); #endif @@ -148,10 +171,11 @@ class Env { // Get device count count = API::num_devices(); if (count == 0) { - CHECK(false) << "No device found!"; + LOG(INFO) << "No " << TargetToStr(Type) << " device(s) found!"; } else { LOG(INFO) << "Found " << count << " device(s)"; } + CHECK_GT(max_stream, 0) << "max_stream must be greater than 0."; // create all device for (int i = 0; i < count; i++) { auto dev = Device(i, max_stream); @@ -211,8 +235,8 @@ class Device { std::string name() { return device_prop_.name; } int core_num() { return device_prop_.multiProcessorCount; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } - std::vector exec_streams() { return exec_stream_; } - std::vector io_streams() { return io_stream_; } + const std::vector& exec_streams() { return exec_stream_; } + const std::vector& io_streams() { return io_stream_; } int sm_version() { return sm_version_; } bool has_fp16() { return has_fp16_; } diff --git a/lite/core/memory.cc b/lite/core/memory.cc index dfe07cdc4b2750893815ed64445fe7672dcdb6c8..1f2f7fed7d61b67a76f54a092b6d48951bc9fcbd 100644 --- a/lite/core/memory.cc +++ b/lite/core/memory.cc @@ -50,13 +50,18 @@ void* TargetMalloc(TargetType target, size_t size) { data = TargetWrapper::Malloc(size); break; #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + data = TargetWrapperXPU::Malloc(size); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown supported target " << TargetToStr(target); } return data; } -void TargetFree(TargetType target, void* data) { +void TargetFree(TargetType target, void* data, std::string free_flag) { switch (target) { case TargetType::kHost: case TargetType::kX86: @@ -71,7 +76,11 @@ void TargetFree(TargetType target, void* data) { #endif // LITE_WITH_CUDA #ifdef LITE_WITH_OPENCL case TargetType::kOpenCL: - TargetWrapperCL::Free(data); + if (free_flag == "cl_use_image2d_") { + TargetWrapperCL::FreeImage(data); + } else { + TargetWrapperCL::Free(data); + } break; #endif // LITE_WITH_OPENCL #ifdef 
LITE_WITH_FPGA @@ -89,6 +98,11 @@ void TargetFree(TargetType target, void* data) { TargetWrapper::Free(data); break; #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU + case TargetType::kXPU: + TargetWrapperXPU::Free(data); + break; +#endif // LITE_WITH_XPU default: LOG(FATAL) << "Unknown type"; } diff --git a/lite/core/memory.h b/lite/core/memory.h index 5a56f73b0de0fce64905f483ded88eda9ceffd52..a1013910019251271ddfccfbc700297c45226fe6 100644 --- a/lite/core/memory.h +++ b/lite/core/memory.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once +#include #include "lite/api/paddle_place.h" #include "lite/core/target_wrapper.h" #include "lite/utils/logging.h" @@ -34,6 +35,10 @@ #include "lite/backends/mlu/target_wrapper.h" #endif // LITE_WITH_MLU +#ifdef LITE_WITH_XPU +#include "lite/backends/xpu/target_wrapper.h" +#endif // LITE_WITH_XPU + namespace paddle { namespace lite { @@ -43,7 +48,9 @@ LITE_API void* TargetMalloc(TargetType target, size_t size); // Free memory for a specific Target. All the targets should be an element in // the `switch` here. -void LITE_API TargetFree(TargetType target, void* data); +void LITE_API TargetFree(TargetType target, + void* data, + std::string free_flag = ""); // Copy a buffer from host to another target. void TargetCopy(TargetType target, void* dst, const void* src, size_t size); @@ -117,6 +124,9 @@ class Buffer { data_ = TargetMalloc(target, size); target_ = target; space_ = size; +#ifdef LITE_WITH_OPENCL + cl_use_image2d_ = false; +#endif } } @@ -128,15 +138,15 @@ class Buffer { const size_t img_w, const size_t img_h, void* host_ptr = nullptr) { - size_t size = sizeof(T) * img_w * img_h * - 4; // 4 for RGBA, un-used for opencl Image2D if (target != target_ || cl_image2d_width_ < img_w || - cl_image2d_height_ < img_h) { + cl_image2d_height_ < img_h || host_ptr != nullptr) { CHECK_EQ(own_data_, true) << "Can not reset unowned buffer."; Free(); data_ = TargetWrapperCL::MallocImage(img_w, img_h, host_ptr); target_ = target; - space_ = size; // un-used for opencl Image2D + space_ = sizeof(T) * img_w * img_h * + 4; // un-used for opencl Image2D, 4 for RGBA, + cl_use_image2d_ = true; cl_image2d_width_ = img_w; cl_image2d_height_ = img_h; } @@ -145,7 +155,11 @@ class Buffer { void Free() { if (space_ > 0 && own_data_) { - TargetFree(target_, data_); + if (!cl_use_image2d_) { + TargetFree(target_, data_); + } else { + TargetFree(target_, data_, "cl_use_image2d_"); + } } data_ = nullptr; target_ = TargetType::kHost; @@ -164,6 +178,7 @@ class Buffer { private: // memory it actually malloced. 
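Editorial note (illustrative, not part of the patch): the Buffer changes above keep allocation lazy: ResetLazy reallocates only when the requested size exceeds the currently owned space (or the target changes), and Free now routes OpenCL Image2D allocations to TargetWrapperCL::FreeImage via the new cl_use_image2d_ flag. A host-only sketch of the lazy-growth part, assuming a plain malloc/free allocator.

#include <cstdio>
#include <cstdlib>

// Minimal host-only analogue of Buffer::ResetLazy()/Free().
class LazyBuffer {
 public:
  void ResetLazy(size_t size) {
    if (size > space_) {          // grow only when the owned block is too small
      Free();
      data_ = std::malloc(size);
      space_ = size;
    }
  }
  void Free() {
    if (space_ > 0) {
      std::free(data_);
      data_ = nullptr;
      space_ = 0;
    }
  }
  size_t space() const { return space_; }
  ~LazyBuffer() { Free(); }

 private:
  void* data_{nullptr};
  size_t space_{0};
};

int main() {
  LazyBuffer buf;
  buf.ResetLazy(128);   // allocates
  buf.ResetLazy(64);    // no-op: the existing block is large enough
  buf.ResetLazy(256);   // reallocates
  std::printf("owned space: %zu bytes\n", buf.space());
  return 0;
}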
size_t space_{0}; + bool cl_use_image2d_{false}; // only used for OpenCL Image2D size_t cl_image2d_width_{0}; // only used for OpenCL Image2D size_t cl_image2d_height_{0}; // only used for OpenCL Image2D void* data_{nullptr}; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 8ee2a3fcd94a527b43836b69f731577dabee6ed3..d036bf7988b98e64586e42683d33b4696e9ff706 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -21,6 +21,8 @@ lite_cc_library(mir_passes fusion/elementwise_add_activation_fuse_pass.cc fusion/quant_dequant_fuse_pass.cc fusion/sequence_pool_concat_fuse_pass.cc + fusion/__xpu__resnet_fuse_pass.cc + fusion/__xpu__multi_encoder_fuse_pass.cc elimination/identity_scale_eliminate_pass.cc elimination/elementwise_mul_constant_eliminate_pass.cc static_kernel_pick_pass.cc @@ -35,8 +37,8 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + multi_stream_analysis_pass.cc mlu_postprocess_pass.cc - subgraph_cast_display_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc DEPS mir_pass types context ${mir_fusers} ${mir_subgraphs}) @@ -71,10 +73,10 @@ set(pattern_deps mir_node mir_ssa_graph op) if (WITH_TESTING) list(APPEND pattern_deps gtest) endif() -lite_cc_library(pattern_matcher SRCS pattern_matcher.cc DEPS ${pattern_deps}) +lite_cc_library(pattern_matcher SRCS pattern_matcher.cc xpu_pattern_matcher.cc DEPS ${pattern_deps}) lite_cc_test(test_pattern_matcher SRCS pattern_matcher_test.cc DEPS pattern_matcher) -lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher) +lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc xpu_pattern_matcher_high_api.cc DEPS pattern_matcher) # for mobile, unnecessary to compile the following testings. diff --git a/lite/core/mir/dot.h b/lite/core/mir/dot.h index df70565c0775acdb61cb540598f15b7f84e0119c..a68890910ab33bd32c68efc6f06236db21909a05 100644 --- a/lite/core/mir/dot.h +++ b/lite/core/mir/dot.h @@ -27,8 +27,8 @@ #include "lite/utils/string.h" namespace paddle { -namespace inference { -namespace analysis { +namespace lite { +namespace mir { static size_t dot_node_counter{0}; @@ -162,6 +162,6 @@ class Dot { std::vector attrs_; }; -} // namespace analysis -} // namespace inference +} // namespace mir +} // namespace lite } // namespace paddle diff --git a/lite/core/mir/fusion/CMakeLists.txt b/lite/core/mir/fusion/CMakeLists.txt index e65e72cf7b367ee8477f3f783ae4d82372529864..04a36976c7110c64ef781af12fc86fd4853fe583 100644 --- a/lite/core/mir/fusion/CMakeLists.txt +++ b/lite/core/mir/fusion/CMakeLists.txt @@ -27,10 +27,10 @@ lite_cc_library(fuse_transpose_softmax_transpose DEPS pattern_matcher_high_api) lite_cc_library(fuse_interpolate SRCS interpolate_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) lite_cc_library(fuse_sequence_pool_concat SRCS sequence_pool_concat_fuser.cc - DEPS pattern_matcher_high_api) + DEPS pattern_matcher_high_api) set(mir_fusers fuse_fc diff --git a/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..655274070f1ffcccf39b5f3ff6aaa705c5cbbfda --- /dev/null +++ b/lite/core/mir/fusion/__xpu__multi_encoder_fuse_pass.cc @@ -0,0 +1,637 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { + +namespace fusion { + +class XPUSingleEncoderFuser : public FuseBase { + public: + explicit XPUSingleEncoderFuser(const std::string& act_type = "gelu") + : act_type_(act_type) {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + + auto* q_mul_y = + VarNode("q_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* q_mul = OpNode("q_mul", "mul"); + auto* q_mul_out = VarNode("q_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* q_add_y = VarNode("q_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* q_add = OpNode("q_add", "elementwise_add")->AsIntermediate(); + auto* q_add_out = VarNode("q_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* q_reshape2 = OpNode("q_reshape2", "reshape2")->AsIntermediate(); + auto* q_reshape2_out = VarNode("q_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* q_reshape2_xshape = VarNode("q_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* q_transpose2 = OpNode("q_transpose2", "transpose2")->AsIntermediate(); + auto* q_transpose2_out = VarNode("q_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("scale", "X") + ->AsIntermediate(); + auto* q_transpose2_xshape = + VarNode("q_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* q_scale = OpNode("q_scale", "scale")->AsIntermediate(); + auto* q_scale_out = VarNode("q_scale_out") + ->assert_is_op_output("scale", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + + auto* k_mul_y = + VarNode("k_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* k_mul = OpNode("k_mul", "mul")->AsIntermediate(); + auto* k_mul_out = VarNode("k_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* k_add_y = VarNode("k_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* k_add = OpNode("k_add", "elementwise_add")->AsIntermediate(); + auto* k_add_out = VarNode("k_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* k_reshape2 = OpNode("k_reshape2", "reshape2")->AsIntermediate(); + auto* k_reshape2_out = VarNode("k_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* 
k_reshape2_xshape = VarNode("k_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* k_transpose2 = OpNode("k_transpose2", "transpose2")->AsIntermediate(); + auto* k_transpose2_out = VarNode("k_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* k_transpose2_xshape = + VarNode("k_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qk_matmul = OpNode("qk_matmul", "matmul")->AsIntermediate(); + auto* qk_matmul_out = VarNode("qk_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qk_mask = VarNode("qk_mask") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qk_add = OpNode("qk_add", "elementwise_add")->AsIntermediate(); + auto* qk_add_out = VarNode("qk_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("softmax", "X") + ->AsIntermediate(); + auto* qk_softmax = OpNode("qk_softmax", "softmax")->AsIntermediate(); + auto* qk_softmax_out = VarNode("qk_softmax_out") + ->assert_is_op_output("softmax", "Out") + ->AsIntermediate(); + auto* qk_dropout = OpNode("qk_dropout", "dropout")->AsIntermediate(); + auto* qk_dropout_out = VarNode("qk_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("matmul", "X") + ->AsIntermediate(); + auto* qk_dropout_mask = VarNode("qk_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* v_mul_y = + VarNode("v_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* v_mul = OpNode("v_mul", "mul")->AsIntermediate(); + auto* v_mul_out = VarNode("v_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* v_add_y = VarNode("v_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* v_add = OpNode("v_add", "elementwise_add")->AsIntermediate(); + auto* v_add_out = VarNode("v_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* v_reshape2 = OpNode("v_reshape2", "reshape2")->AsIntermediate(); + auto* v_reshape2_out = VarNode("v_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* v_reshape2_xshape = VarNode("v_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* v_transpose2 = OpNode("v_transpose2", "transpose2")->AsIntermediate(); + auto* v_transpose2_out = VarNode("v_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("matmul", "Y") + ->AsIntermediate(); + auto* v_transpose2_xshape = + VarNode("v_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + + auto* qkv_matmul = OpNode("qkv_matmul", "matmul")->AsIntermediate(); + auto* qkv_matmul_out = VarNode("qkv_matmul_out") + ->assert_is_op_output("matmul", "Out") + ->assert_is_op_input("transpose2", "X") + ->AsIntermediate(); + auto* qkv_transpose2 = + OpNode("qkv_transpose2", "transpose2")->AsIntermediate(); + auto* qkv_transpose2_out = VarNode("qkv_transpose2_out") + ->assert_is_op_output("transpose2", "Out") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate(); + auto* qkv_transpose2_xshape = + VarNode("qkv_transpose2_xshape") + ->assert_is_op_output("transpose2", "XShape") + ->AsIntermediate(); + auto* qkv_reshape2 = 
OpNode("qkv_reshape2", "reshape2")->AsIntermediate(); + auto* qkv_reshape2_out = VarNode("qkv_reshape2_out") + ->assert_is_op_output("reshape2", "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_reshape2_xshape = VarNode("qkv_reshape2_xshape") + ->assert_is_op_output("reshape2", "XShape") + ->AsIntermediate(); + auto* qkv_mul_y = + VarNode("qkv_mul_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul = OpNode("qkv_mul", "mul")->AsIntermediate(); + auto* qkv_mul_out = VarNode("qkv_mul_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_y = VarNode("qkv_add_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add = OpNode("qkv_add", "elementwise_add")->AsIntermediate(); + auto* qkv_add_out = VarNode("qkv_add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout = OpNode("qkv_dropout", "dropout")->AsIntermediate(); + auto* qkv_dropout_out = VarNode("qkv_dropout_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_mask = VarNode("qkv_dropout_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_2 = OpNode("qkv_add_2", "elementwise_add")->AsIntermediate(); + auto* qkv_add_2_out = VarNode("qkv_add_2_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_2_scale = VarNode("qkv_ln_2_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_2_bias = VarNode("qkv_ln_2_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_2 = OpNode("qkv_ln_2", "layer_norm")->AsIntermediate(); + auto* qkv_ln_2_out = VarNode("qkv_ln_2_out") + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("mul", "X") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* qkv_ln_2_mean = VarNode("qkv_ln_2_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_2_var = VarNode("qkv_ln_2_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + auto* qkv_mul_3_y = + VarNode("qkv_mul_3_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_3 = OpNode("qkv_mul_3", "mul")->AsIntermediate(); + auto* qkv_mul_3_out = VarNode("qkv_mul_3_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_3_y = VarNode("qkv_add_3_y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_3 = OpNode("qkv_add_3", "elementwise_add")->AsIntermediate(); + auto* qkv_add_3_out = VarNode("qkv_add_3_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input(act_type_, "X") + ->AsIntermediate(); + auto* qkv_act = OpNode("qkv_act", act_type_)->AsIntermediate(); + auto* qkv_act_out = VarNode("qkv_act_out") + ->assert_is_op_output(act_type_, "Out") + ->assert_is_op_input("mul", "X") + ->AsIntermediate(); + auto* qkv_mul_4_y = + VarNode("qkv_mul_4_y")->assert_is_op_input("mul", "Y")->AsInput(); + auto* qkv_mul_4 = OpNode("qkv_mul_4", "mul")->AsIntermediate(); + auto* qkv_mul_4_out = VarNode("qkv_mul_4_out") + ->assert_is_op_output("mul", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_add_4_y = VarNode("qkv_add_4_y") + 
->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto* qkv_add_4 = OpNode("qkv_add_4", "elementwise_add")->AsIntermediate(); + auto* qkv_add_4_out = VarNode("qkv_add_4_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("dropout", "X") + ->AsIntermediate(); + auto* qkv_dropout_4 = OpNode("qkv_dropout_4", "dropout")->AsIntermediate(); + auto* qkv_dropout_4_out = VarNode("qkv_dropout_4_out") + ->assert_is_op_output("dropout", "Out") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* qkv_dropout_4_mask = VarNode("qkv_dropout_4_mask") + ->assert_is_op_output("dropout", "Mask") + ->AsIntermediate(); + + auto* qkv_add_5 = OpNode("qkv_add_5", "elementwise_add")->AsIntermediate(); + auto* qkv_add_5_out = VarNode("qkv_add_5_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsIntermediate(); + auto* qkv_ln_5_scale = VarNode("qkv_ln_5_scale") + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto* qkv_ln_5_bias = VarNode("qkv_ln_5_bias") + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto* qkv_ln_5 = OpNode("qkv_ln_5", "layer_norm")->AsIntermediate(); + auto* qkv_ln_5_out = VarNode("qkv_ln_5_out") + ->assert_is_op_output("layer_norm", "Y") + ->AsOutput(); + auto* qkv_ln_5_mean = VarNode("qkv_ln_5_mean") + ->assert_is_op_output("layer_norm", "Mean") + ->AsIntermediate(); + auto* qkv_ln_5_var = VarNode("qkv_ln_5_var") + ->assert_is_op_output("layer_norm", "Variance") + ->AsIntermediate(); + + // TODO(miaotianxiang): use LinksFrom/LinksTo() instead + *input >> *q_mul >> *q_mul_out >> *q_add >> *q_add_out >> *q_reshape2 >> + *q_reshape2_out >> *q_transpose2 >> *q_transpose2_out >> *q_scale >> + *q_scale_out >> *qk_matmul; + *q_mul_y >> *q_mul; + *q_add_y >> *q_add; + *q_reshape2 >> *q_reshape2_xshape; + *q_transpose2 >> *q_transpose2_xshape; + + *input >> *k_mul >> *k_mul_out >> *k_add >> *k_add_out >> *k_reshape2 >> + *k_reshape2_out >> *k_transpose2 >> *k_transpose2_out >> *qk_matmul; + *k_mul_y >> *k_mul; + *k_add_y >> *k_add; + *k_reshape2 >> *k_reshape2_xshape; + *k_transpose2 >> *k_transpose2_xshape; + + *qk_matmul >> *qk_matmul_out >> *qk_add >> *qk_add_out >> *qk_softmax >> + *qk_softmax_out >> *qk_dropout >> *qk_dropout_out >> *qkv_matmul; + *qk_mask >> *qk_add; + *qk_dropout >> *qk_dropout_mask; + + *input >> *v_mul >> *v_mul_out >> *v_add >> *v_add_out >> *v_reshape2 >> + *v_reshape2_out >> *v_transpose2 >> *v_transpose2_out >> *qkv_matmul; + *v_mul_y >> *v_mul; + *v_add_y >> *v_add; + *v_reshape2 >> *v_reshape2_xshape; + *v_transpose2 >> *v_transpose2_xshape; + + *qkv_matmul >> *qkv_matmul_out >> *qkv_transpose2 >> *qkv_transpose2_out >> + *qkv_reshape2 >> *qkv_reshape2_out >> *qkv_mul >> *qkv_mul_out >> + *qkv_add >> *qkv_add_out >> *qkv_dropout >> *qkv_dropout_out >> + *qkv_add_2; + *qkv_transpose2 >> *qkv_transpose2_xshape; + *qkv_reshape2 >> *qkv_reshape2_xshape; + *qkv_mul_y >> *qkv_mul; + *qkv_add_y >> *qkv_add; + *qkv_dropout >> *qkv_dropout_mask; + + *input >> *qkv_add_2 >> *qkv_add_2_out >> *qkv_ln_2 >> *qkv_ln_2_out; + *qkv_ln_2_scale >> *qkv_ln_2; + *qkv_ln_2_bias >> *qkv_ln_2; + *qkv_ln_2 >> *qkv_ln_2_mean; + *qkv_ln_2 >> *qkv_ln_2_var; + + *qkv_ln_2_out >> *qkv_mul_3 >> *qkv_mul_3_out >> *qkv_add_3 >> + *qkv_add_3_out >> *qkv_act >> *qkv_act_out >> *qkv_mul_4 >> + *qkv_mul_4_out >> *qkv_add_4 >> *qkv_add_4_out >> *qkv_dropout_4 >> + *qkv_dropout_4_out >> *qkv_add_5; + *qkv_mul_3_y >> *qkv_mul_3; + *qkv_add_3_y >> *qkv_add_3; + 
*qkv_mul_4_y >> *qkv_mul_4; + *qkv_add_4_y >> *qkv_add_4; + *qkv_dropout_4 >> *qkv_dropout_4_mask; + + *qkv_ln_2_out >> *qkv_add_5 >> *qkv_add_5_out >> *qkv_ln_5 >> *qkv_ln_5_out; + *qkv_ln_5_scale >> *qkv_ln_5; + *qkv_ln_5_bias >> *qkv_ln_5; + *qkv_ln_5 >> *qkv_ln_5_mean; + *qkv_ln_5 >> *qkv_ln_5_var; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("single_encoder"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Mask", {matched.at("qk_mask")->arg()->name}); + op_desc.SetInput("FCWeight", + { + matched.at("q_mul_y")->arg()->name, + matched.at("k_mul_y")->arg()->name, + matched.at("v_mul_y")->arg()->name, + matched.at("qkv_mul_y")->arg()->name, + matched.at("qkv_mul_3_y")->arg()->name, + matched.at("qkv_mul_4_y")->arg()->name, + }); + op_desc.SetInput("FCBias", + { + matched.at("q_add_y")->arg()->name, + matched.at("k_add_y")->arg()->name, + matched.at("v_add_y")->arg()->name, + matched.at("qkv_add_y")->arg()->name, + matched.at("qkv_add_3_y")->arg()->name, + matched.at("qkv_add_4_y")->arg()->name, + }); + op_desc.SetInput("LNScale", + { + matched.at("qkv_ln_2_scale")->arg()->name, + matched.at("qkv_ln_5_scale")->arg()->name, + }); + op_desc.SetInput("LNBias", + { + matched.at("qkv_ln_2_bias")->arg()->name, + matched.at("qkv_ln_5_bias")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("qkv_ln_5_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + // extra traits to distill + auto* reshape_op_info = matched.at("q_reshape2")->stmt()->op_info(); + auto reshape_dim = reshape_op_info->GetAttr>("shape"); + op_desc.SetAttr("head_num", reshape_dim[2]); + op_desc.SetAttr("size_per_head", reshape_dim[3]); + op_desc.SetAttr("act_type", act_type_); + + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + auto* single_encoder_stmt = matched.at("q_mul")->stmt(); + fake_subgraph_op->Attach(op_desc, single_encoder_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(single_encoder_stmt->op()->valid_places()); + single_encoder_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "qk_mask", + "k_mul_y", + "v_mul_y", + "qkv_mul_y", + "qkv_mul_3_y", + "qkv_mul_4_y", + "q_add_y", + "k_add_y", + "v_add_y", + "qkv_add_y", + "qkv_add_3_y", + "qkv_add_4_y", + "qkv_ln_2_scale", + "qkv_ln_2_bias", + "qkv_ln_5_scale", + "qkv_ln_5_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("q_mul")); + } + IR_OP_VAR_LINK(matched.at("q_mul"), matched.at("qkv_ln_5_out")); + } + + private: + std::string act_type_; +}; + +class XPUMultiEncoderFuser { + public: + bool IsDirectPredecessorOf(Node* op1, Node* op2) { + for (auto* out : op1->outlinks) { + for (auto* in : op2->inlinks) { + if (out == in) return true; + } + } + return false; + } + + void operator()(SSAGraph* graph) { + std::vector all_encoders; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "single_encoder") { + all_encoders.push_back(node); + } + } + VLOG(3) << "Found " << all_encoders.size() << " single_encoder"; + if (all_encoders.size() == 0) { + return; + } + + // TODO(miaotianxiang): more verification + for (size_t i = 0; i < all_encoders.size() - 1; ++i) { + CHECK(IsDirectPredecessorOf(all_encoders[i], all_encoders[i + 1])); + } + std::string mask_name; + for (auto* encoder : all_encoders) { + auto* op_info = encoder->stmt()->op_info(); + if (mask_name.empty()) { + mask_name = op_info->Input("Mask").front(); + } else { + // CHECK(mask_name == op_info->Input("Mask").front()); + } + } + + std::unordered_set to_remove; + Node* first_encoder = all_encoders[0]; + std::string in_name, out_name; + std::vector arg_names{ + "FCWeight", "FCBias", "LNScale", "LNBias"}; + std::unordered_map> arg_map; + for (size_t i = 0; i < all_encoders.size(); ++i) { + Node* cur_encoder = all_encoders[i]; + auto* op_info = cur_encoder->stmt()->op_info(); + for (auto arg_name : arg_names) { + auto real_names = op_info->Input(arg_name); + for (auto name : real_names) { + auto* arg_node = graph->RetrieveArgument(name); + DirectedLink(arg_node, first_encoder); + arg_map[arg_name].push_back(name); + } + } + + auto* cur_out = + graph->RetrieveArgument(op_info->Output("Outputs").front()); + if (i == 0) { + // first encoder + to_remove.insert(cur_out); + in_name = op_info->Input("Inputs").front(); + mask_name = op_info->Input("Mask").front(); + } else if (i == all_encoders.size() - 1) { + // last encoder + to_remove.insert(cur_encoder); + DirectedLink(first_encoder, cur_out); + out_name = op_info->Output("Outputs").front(); + } else { + to_remove.insert(cur_encoder); + to_remove.insert(cur_out); + } + } + GraphSafeRemoveNodes(graph, to_remove); + + auto* multi_encoder_stmt = first_encoder->stmt(); + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__multi_encoder"); + op_desc.SetInput("Input", {in_name}); + for (auto kv : arg_map) { + op_desc.SetInput(kv.first, kv.second); + } + op_desc.SetInput("Mask", {mask_name}); + op_desc.SetOutput("Output", {out_name}); + op_desc.SetAttr("xpu", 1); + auto* first_encoder_op_info = multi_encoder_stmt->op_info(); + op_desc.SetAttr("head_num", + first_encoder_op_info->GetAttr("head_num")); + 
op_desc.SetAttr("size_per_head", + first_encoder_op_info->GetAttr("size_per_head")); + op_desc.SetAttr("n_layers", all_encoders.size()); + op_desc.SetAttr( + "act_type", first_encoder_op_info->GetAttr("act_type")); + + auto* scope = multi_encoder_stmt->op()->scope(); + std::vector fc_weight_max(arg_map["FCWeight"].size()); + auto& fc_weight_names = arg_map["FCWeight"]; + for (size_t i = 0; i < fc_weight_names.size(); ++i) { + auto* weight_t = scope->FindMutableTensor(fc_weight_names[i]); + auto weight_dims = weight_t->dims(); + int weight_len = weight_t->numel(); + float* weight_on_host = weight_t->mutable_data(); + float max_f = + paddle::lite::xpu::math::FindMaxAbs(weight_on_host, weight_len); + + std::unique_ptr weight_int16(new int16_t[weight_len]); + std::unique_ptr weight_trans_int16(new int16_t[weight_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + weight_on_host, weight_int16.get(), max_f, weight_len); + paddle::lite::xpu::math::Transpose(weight_int16.get(), + weight_trans_int16.get(), + weight_dims[0], + weight_dims[1]); + memcpy(weight_on_host, + weight_trans_int16.get(), + weight_len * sizeof(int16_t)); + fc_weight_max[i] = max_f; + } + + std::string max_name = "encoder_max"; + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, first_encoder); + auto* max_filter_tensor = scope->NewTensor(max_name); + max_filter_tensor->Resize({static_cast(fc_weight_max.size())}); + memcpy(max_filter_tensor->mutable_data(), + &fc_weight_max[0], + sizeof(float) * fc_weight_max.size()); + op_desc.SetInput("FCWeightMax", {max_name}); + + auto multi_encoder_op = LiteOpRegistry::Global().Create(op_desc.Type()); + multi_encoder_op->Attach(op_desc, scope); + multi_encoder_op->SetValidPlaces(multi_encoder_stmt->op()->valid_places()); + auto kernels = + multi_encoder_op->CreateKernels(multi_encoder_op->valid_places()); + multi_encoder_stmt->SetOp(multi_encoder_op); + multi_encoder_stmt->SetKernels(std::move(kernels)); + + // temp remove useless cast + std::unordered_set to_remove2; + Node* stack = nullptr; + for (auto* node : graph->StmtTopologicalOrder()) { + CHECK(node->IsStmt()); + if (node->stmt()->op_info()->Type() == "stack") { + stack = node; + } + } + Node* stack_out = stack->outlinks.front(); + for (Node* cast : stack_out->outlinks) { + Node* cast_out = cast->outlinks.front(); + if (cast_out->outlinks.size() == 0) { + // remove + to_remove2.insert(cast_out); + to_remove2.insert(cast); + } + } + GraphSafeRemoveNodes(graph, to_remove2); + } +}; + +} // namespace fusion + +class XPUMultiEncoderFusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + // TODO(miaotianxiang): backup graph, recover from failed match + std::vector act_types{"gelu", "relu"}; + for (auto& act_type : act_types) { + fusion::XPUSingleEncoderFuser single_encoder_fuser(act_type); + single_encoder_fuser(graph.get()); + fusion::XPUMultiEncoderFuser multi_encoder_fuser; + multi_encoder_fuser(graph.get()); + } + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__multi_encoder_fuse_pass, + paddle::lite::mir::XPUMultiEncoderFusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("matmul"); diff --git a/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc 
new file mode 100644 index 0000000000000000000000000000000000000000..de2210a76ea0647cb02131a088ceb754afd0ef9c --- /dev/null +++ b/lite/core/mir/fusion/__xpu__resnet_fuse_pass.cc @@ -0,0 +1,951 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "lite/backends/xpu/math.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include "lite/operators/subgraph_op.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace fusion { + +class XPUResNetBlock0Fuser : public FuseBase { + public: + XPUResNetBlock0Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* left_conv1_weight = VarNode("left_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv1 = OpNode("left_conv1", "conv2d"); + auto* left_conv1_out = VarNode("left_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn1_scale = VarNode("left_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn1_bias = VarNode("left_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn1_mean = VarNode("left_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn1_var = VarNode("left_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn1 = OpNode("left_bn1", "batch_norm")->AsIntermediate(); + auto* left_bn1_out = VarNode("left_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn1_mean_out = VarNode("left_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn1_var_out = + VarNode("left_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn1_saved_mean = + VarNode("left_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn1_saved_var = + VarNode("left_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu1 = OpNode("left_relu1", "relu")->AsIntermediate(); + auto* left_relu1_out = VarNode("left_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv2_weight = VarNode("left_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv2 = OpNode("left_conv2", "conv2d")->AsIntermediate(); + auto* left_conv2_out = VarNode("left_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn2_scale = VarNode("left_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + 
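    // Reading aid (not from the patch): in this pattern DSL, `*a >> *b` adds a
    // directed pattern edge a -> b, roughly the chained form of the explicit
    // b->LinksFrom({a}) calls used in the quant/dequant fuser changes below;
    // AsInput()/AsOutput() keep a matched variable node alive after fusion,
    // while AsIntermediate() marks it for removal once the whole pattern is
    // replaced by the fused op.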
auto* left_bn2_bias = VarNode("left_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn2_mean = VarNode("left_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn2_var = VarNode("left_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn2 = OpNode("left_bn2", "batch_norm")->AsIntermediate(); + auto* left_bn2_out = VarNode("left_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* left_bn2_mean_out = VarNode("left_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn2_var_out = + VarNode("left_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn2_saved_mean = + VarNode("left_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn2_saved_var = + VarNode("left_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* left_relu2 = OpNode("left_relu2", "relu")->AsIntermediate(); + auto* left_relu2_out = VarNode("left_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* left_conv3_weight = VarNode("left_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* left_conv3 = OpNode("left_conv3", "conv2d")->AsIntermediate(); + auto* left_conv3_out = VarNode("left_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* left_bn3_scale = VarNode("left_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* left_bn3_bias = VarNode("left_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* left_bn3_mean = VarNode("left_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* left_bn3_var = VarNode("left_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* left_bn3 = OpNode("left_bn3", "batch_norm")->AsIntermediate(); + auto* left_bn3_out = VarNode("left_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* left_bn3_mean_out = VarNode("left_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* left_bn3_var_out = + VarNode("left_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* left_bn3_saved_mean = + VarNode("left_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* left_bn3_saved_var = + VarNode("left_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d")->AsIntermediate(); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + 
->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *left_conv1 >> *left_conv1_out >> *left_bn1 >> *left_bn1_out >> + *left_relu1 >> *left_relu1_out >> *left_conv2 >> *left_conv2_out >> + *left_bn2 >> *left_bn2_out >> *left_relu2 >> *left_relu2_out >> + *left_conv3 >> *left_conv3_out >> *left_bn3 >> *left_bn3_out >> *add; + + *left_conv1_weight >> *left_conv1; + *left_bn1_scale >> *left_bn1; + *left_bn1_bias >> *left_bn1; + *left_bn1_mean >> *left_bn1; + *left_bn1_var >> *left_bn1; + *left_bn1 >> *left_bn1_mean_out; + *left_bn1 >> *left_bn1_var_out; + *left_bn1 >> *left_bn1_saved_mean; + *left_bn1 >> *left_bn1_saved_var; + + *left_conv2_weight >> *left_conv2; + *left_bn2_scale >> *left_bn2; + *left_bn2_bias >> *left_bn2; + *left_bn2_mean >> *left_bn2; + *left_bn2_var >> *left_bn2; + *left_bn2 >> *left_bn2_mean_out; + *left_bn2 >> *left_bn2_var_out; + *left_bn2 >> *left_bn2_saved_mean; + *left_bn2 >> *left_bn2_saved_var; + + *left_conv3_weight >> *left_conv3; + *left_bn3_scale >> *left_bn3; + *left_bn3_bias >> *left_bn3; + *left_bn3_mean >> *left_bn3; + *left_bn3_var >> *left_bn3; + *left_bn3 >> *left_bn3_mean_out; + *left_bn3 >> *left_bn3_var_out; + *left_bn3 >> *left_bn3_saved_mean; + *left_bn3 >> *left_bn3_saved_var; + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block0"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("left_conv1_weight")->arg()->name, + matched.at("left_conv2_weight")->arg()->name, + matched.at("left_conv3_weight")->arg()->name, + matched.at("right_conv1_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("left_bn1_scale")->arg()->name, + matched.at("left_bn2_scale")->arg()->name, + 
matched.at("left_bn3_scale")->arg()->name, + matched.at("right_bn1_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("left_bn1_bias")->arg()->name, + matched.at("left_bn2_bias")->arg()->name, + matched.at("left_bn3_bias")->arg()->name, + matched.at("right_bn1_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("left_bn1_mean")->arg()->name, + matched.at("left_bn2_mean")->arg()->name, + matched.at("left_bn3_mean")->arg()->name, + matched.at("right_bn1_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("left_bn1_variance")->arg()->name, + matched.at("left_bn2_variance")->arg()->name, + matched.at("left_bn3_variance")->arg()->name, + matched.at("right_bn1_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block0_stmt = matched.at("left_conv1")->stmt(); + // block0_stmt->ResetOp(op_desc, graph->valid_places()); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? + auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block0_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block0_stmt->op()->valid_places()); + block0_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "left_conv2_weight", + "left_conv3_weight", + "right_conv1_weight", + "left_bn1_bias", + "left_bn2_bias", + "left_bn3_bias", + "right_bn1_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("left_conv1")); + } + IR_OP_VAR_LINK(matched.at("left_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNetBlock1Fuser : public FuseBase { + public: + XPUResNetBlock1Fuser() {} + + void BuildPattern() override { + auto* input = VarNode("input") + ->assert_is_op_input("conv2d", "Input") + ->assert_is_op_input("elementwise_add", "X") + ->AsInput(); + + auto* right_conv1_weight = VarNode("right_conv1_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv1 = OpNode("right_conv1", "conv2d"); + auto* right_conv1_out = VarNode("right_conv1_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn1_scale = VarNode("right_bn1_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn1_bias = VarNode("right_bn1_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn1_mean = VarNode("right_bn1_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn1_var = VarNode("right_bn1_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn1 = OpNode("right_bn1", "batch_norm")->AsIntermediate(); + auto* right_bn1_out = VarNode("right_bn1_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn1_mean_out = + VarNode("right_bn1_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn1_var_out = + VarNode("right_bn1_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn1_saved_mean = + VarNode("right_bn1_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* 
right_bn1_saved_var = + VarNode("right_bn1_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu1 = OpNode("right_relu1", "relu")->AsIntermediate(); + auto* right_relu1_out = VarNode("right_relu1_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv2_weight = VarNode("right_conv2_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv2 = OpNode("right_conv2", "conv2d")->AsIntermediate(); + auto* right_conv2_out = VarNode("right_conv2_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn2_scale = VarNode("right_bn2_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn2_bias = VarNode("right_bn2_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn2_mean = VarNode("right_bn2_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn2_var = VarNode("right_bn2_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn2 = OpNode("right_bn2", "batch_norm")->AsIntermediate(); + auto* right_bn2_out = VarNode("right_bn2_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* right_bn2_mean_out = + VarNode("right_bn2_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn2_var_out = + VarNode("right_bn2_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn2_saved_mean = + VarNode("right_bn2_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn2_saved_var = + VarNode("right_bn2_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* right_relu2 = OpNode("right_relu2", "relu")->AsIntermediate(); + auto* right_relu2_out = VarNode("right_relu2_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("conv2d", "Input") + ->AsIntermediate(); + + auto* right_conv3_weight = VarNode("right_conv3_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* right_conv3 = OpNode("right_conv3", "conv2d")->AsIntermediate(); + auto* right_conv3_out = VarNode("right_conv3_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* right_bn3_scale = VarNode("right_bn3_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* right_bn3_bias = VarNode("right_bn3_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* right_bn3_mean = VarNode("right_bn3_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* right_bn3_var = VarNode("right_bn3_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* right_bn3 = OpNode("right_bn3", "batch_norm")->AsIntermediate(); + auto* right_bn3_out = VarNode("right_bn3_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("elementwise_add", "Y") + ->AsIntermediate(); + auto* right_bn3_mean_out = + VarNode("right_bn3_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* right_bn3_var_out = + VarNode("right_bn3_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* right_bn3_saved_mean = + 
VarNode("right_bn3_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* right_bn3_saved_var = + VarNode("right_bn3_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + + auto* add = OpNode("add", "elementwise_add")->AsIntermediate(); + auto* add_out = VarNode("add_out") + ->assert_is_op_output("elementwise_add", "Out") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* relu = OpNode("relu", "relu")->AsIntermediate(); + auto* relu_out = + VarNode("relu_out")->assert_is_op_output("relu", "Out")->AsOutput(); + + *input >> *right_conv1 >> *right_conv1_out >> *right_bn1 >> + *right_bn1_out >> *right_relu1 >> *right_relu1_out >> *right_conv2 >> + *right_conv2_out >> *right_bn2 >> *right_bn2_out >> *right_relu2 >> + *right_relu2_out >> *right_conv3 >> *right_conv3_out >> *right_bn3 >> + *right_bn3_out >> *add; + + *right_conv1_weight >> *right_conv1; + *right_bn1_scale >> *right_bn1; + *right_bn1_bias >> *right_bn1; + *right_bn1_mean >> *right_bn1; + *right_bn1_var >> *right_bn1; + *right_bn1 >> *right_bn1_mean_out; + *right_bn1 >> *right_bn1_var_out; + *right_bn1 >> *right_bn1_saved_mean; + *right_bn1 >> *right_bn1_saved_var; + + *right_conv2_weight >> *right_conv2; + *right_bn2_scale >> *right_bn2; + *right_bn2_bias >> *right_bn2; + *right_bn2_mean >> *right_bn2; + *right_bn2_var >> *right_bn2; + *right_bn2 >> *right_bn2_mean_out; + *right_bn2 >> *right_bn2_var_out; + *right_bn2 >> *right_bn2_saved_mean; + *right_bn2 >> *right_bn2_saved_var; + + *right_conv3_weight >> *right_conv3; + *right_bn3_scale >> *right_bn3; + *right_bn3_bias >> *right_bn3; + *right_bn3_mean >> *right_bn3; + *right_bn3_var >> *right_bn3; + *right_bn3 >> *right_bn3_mean_out; + *right_bn3 >> *right_bn3_var_out; + *right_bn3 >> *right_bn3_saved_mean; + *right_bn3 >> *right_bn3_saved_var; + + *input >> *add; + + *add >> *add_out >> *relu >> *relu_out; + } + + void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override { + cpp::OpDesc op_desc; + op_desc.SetType("resnet_block1"); + op_desc.SetInput("Inputs", {matched.at("input")->arg()->name}); + op_desc.SetInput("Filter", + { + matched.at("right_conv1_weight")->arg()->name, + matched.at("right_conv2_weight")->arg()->name, + matched.at("right_conv3_weight")->arg()->name, + }); + op_desc.SetInput("Scale", + { + matched.at("right_bn1_scale")->arg()->name, + matched.at("right_bn2_scale")->arg()->name, + matched.at("right_bn3_scale")->arg()->name, + }); + op_desc.SetInput("Bias", + { + matched.at("right_bn1_bias")->arg()->name, + matched.at("right_bn2_bias")->arg()->name, + matched.at("right_bn3_bias")->arg()->name, + }); + op_desc.SetInput("Mean", + { + matched.at("right_bn1_mean")->arg()->name, + matched.at("right_bn2_mean")->arg()->name, + matched.at("right_bn3_mean")->arg()->name, + }); + op_desc.SetInput("Var", + { + matched.at("right_bn1_variance")->arg()->name, + matched.at("right_bn2_variance")->arg()->name, + matched.at("right_bn3_variance")->arg()->name, + }); + op_desc.SetOutput("Outputs", {matched.at("relu_out")->arg()->name}); + // XXX: keep these to fool SubgraphOp::AttachImpl() + op_desc.SetAttr("sub_block", 0); + op_desc.SetAttr>("input_data_names", {}); + op_desc.SetAttr>("output_data_names", {}); + + auto block1_stmt = matched.at("right_conv1")->stmt(); + auto fake_subgraph_op = LiteOpRegistry::Global().Create("subgraph"); + // XXX: memleak? 
+ auto sub_block_desc = new cpp::BlockDesc(); + static_cast(fake_subgraph_op.get()) + ->SetSubBlock(sub_block_desc); + fake_subgraph_op->Attach(op_desc, block1_stmt->op()->scope()); + fake_subgraph_op->SetValidPlaces(block1_stmt->op()->valid_places()); + block1_stmt->SetOp(fake_subgraph_op); + + std::vector froms = { + "right_conv2_weight", + "right_conv3_weight", + "right_bn1_bias", + "right_bn2_bias", + "right_bn3_bias", + }; + for (auto& from : froms) { + IR_NODE_LINK_TO(matched.at(from), matched.at("right_conv1")); + } + IR_OP_VAR_LINK(matched.at("right_conv1"), matched.at("relu_out")); + } +}; + +class XPUResNet50Fuser : public xpu::XPUFuseBase { + public: + XPUResNet50Fuser() {} + + void BuildPattern() override { + auto* input = + VarNode("input")->assert_is_op_input("conv2d", "Input")->AsInput(); + + auto* top_conv_weight = VarNode("top_conv_weight") + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto* top_conv = OpNode("top_conv", "conv2d"); + auto* top_conv_out = VarNode("top_conv_out") + ->assert_is_op_output("conv2d", "Output") + ->assert_is_op_input("batch_norm", "X") + ->AsIntermediate(); + auto* top_bn_scale = VarNode("top_bn_scale") + ->assert_is_op_input("batch_norm", "Scale") + ->AsIntermediate(); + auto* top_bn_bias = VarNode("top_bn_bias") + ->assert_is_op_input("batch_norm", "Bias") + ->AsInput(); + auto* top_bn_mean = VarNode("top_bn_mean") + ->assert_is_op_input("batch_norm", "Mean") + ->AsIntermediate(); + auto* top_bn_var = VarNode("top_bn_variance") + ->assert_is_op_input("batch_norm", "Variance") + ->AsIntermediate(); + auto* top_bn = OpNode("top_bn", "batch_norm")->AsIntermediate(); + auto* top_bn_out = VarNode("top_bn_out") + ->assert_is_op_output("batch_norm", "Y") + ->assert_is_op_input("relu", "X") + ->AsIntermediate(); + auto* top_bn_mean_out = VarNode("top_bn_mean_out") + ->assert_is_op_output("batch_norm", "MeanOut") + ->AsIntermediate(); + auto* top_bn_var_out = + VarNode("top_bn_var_out") + ->assert_is_op_output("batch_norm", "VarianceOut") + ->AsIntermediate(); + auto* top_bn_saved_mean = + VarNode("top_bn_saved_mean") + ->assert_is_op_output("batch_norm", "SavedMean") + ->AsIntermediate(); + auto* top_bn_saved_var = + VarNode("top_bn_saved_var") + ->assert_is_op_output("batch_norm", "SavedVariance") + ->AsIntermediate(); + auto* top_relu = OpNode("top_relu", "relu")->AsIntermediate(); + auto* top_relu_out = VarNode("top_relu_out") + ->assert_is_op_output("relu", "Out") + ->assert_is_op_input("pool2d", "X") + ->AsIntermediate(); + auto* top_pool = OpNode("top_pool", "pool2d")->AsIntermediate(); + auto* top_pool_out = VarNode("top_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->assert_is_op_input("resnet_block0", "Inputs") + ->AsIntermediate(); + + // args are left out + auto* resnet_block0_1 = + OpNode("resnet_block0_1", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_1_out = + VarNode("resnet_block0_1_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_1 = + OpNode("resnet_block1_1_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_1_out = + VarNode("resnet_block1_1_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_1_2 = + OpNode("resnet_block1_1_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_1_2_out = + VarNode("resnet_block1_1_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_2 = + OpNode("resnet_block0_2", 
"resnet_block0")->AsIntermediate(); + auto* resnet_block0_2_out = + VarNode("resnet_block0_2_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_1 = + OpNode("resnet_block1_2_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_1_out = + VarNode("resnet_block1_2_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_2 = + OpNode("resnet_block1_2_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_2_out = + VarNode("resnet_block1_2_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_2_3 = + OpNode("resnet_block1_2_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_2_3_out = + VarNode("resnet_block1_2_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_3 = + OpNode("resnet_block0_3", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_3_out = + VarNode("resnet_block0_3_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_1 = + OpNode("resnet_block1_3_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_1_out = + VarNode("resnet_block1_3_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_2 = + OpNode("resnet_block1_3_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_2_out = + VarNode("resnet_block1_3_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_3 = + OpNode("resnet_block1_3_3", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_3_out = + VarNode("resnet_block1_3_3_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_4 = + OpNode("resnet_block1_3_4", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_4_out = + VarNode("resnet_block1_3_4_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_3_5 = + OpNode("resnet_block1_3_5", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_3_5_out = + VarNode("resnet_block1_3_5_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* resnet_block0_4 = + OpNode("resnet_block0_4", "resnet_block0")->AsIntermediate(); + auto* resnet_block0_4_out = + VarNode("resnet_block0_4_out") + ->assert_is_op_output("resnet_block0", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_1 = + OpNode("resnet_block1_4_1", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_1_out = + VarNode("resnet_block1_4_1_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + auto* resnet_block1_4_2 = + OpNode("resnet_block1_4_2", "resnet_block1")->AsIntermediate(); + auto* resnet_block1_4_2_out = + VarNode("resnet_block1_4_2_out") + ->assert_is_op_output("resnet_block1", "Outputs") + ->AsIntermediate(); + + auto* bottom_pool = OpNode("bottom_pool", "pool2d")->AsIntermediate(); + auto* bottom_pool_out = VarNode("bottom_pool_out") + ->assert_is_op_output("pool2d", "Out") + ->AsOutput(); + + *input >> *top_conv >> *top_conv_out >> *top_bn >> *top_bn_out >> + *top_relu >> *top_relu_out >> *top_pool >> *top_pool_out >> + *resnet_block0_1 >> *resnet_block0_1_out >> *resnet_block1_1_1 >> + *resnet_block1_1_1_out >> *resnet_block1_1_2 >> + *resnet_block1_1_2_out >> *resnet_block0_2 >> *resnet_block0_2_out >> + *resnet_block1_2_1 >> *resnet_block1_2_1_out >> 
*resnet_block1_2_2 >> + *resnet_block1_2_2_out >> *resnet_block1_2_3 >> + *resnet_block1_2_3_out >> *resnet_block0_3 >> *resnet_block0_3_out >> + *resnet_block1_3_1 >> *resnet_block1_3_1_out >> *resnet_block1_3_2 >> + *resnet_block1_3_2_out >> *resnet_block1_3_3 >> + *resnet_block1_3_3_out >> *resnet_block1_3_4 >> + *resnet_block1_3_4_out >> *resnet_block1_3_5 >> + *resnet_block1_3_5_out >> *resnet_block0_4 >> *resnet_block0_4_out >> + *resnet_block1_4_1 >> *resnet_block1_4_1_out >> *resnet_block1_4_2 >> + *resnet_block1_4_2_out >> *bottom_pool >> *bottom_pool_out; + + *top_conv_weight >> *top_conv; + *top_bn_scale >> *top_bn; + *top_bn_bias >> *top_bn; + *top_bn_mean >> *top_bn; + *top_bn_var >> *top_bn; + *top_bn >> *top_bn_mean_out; + *top_bn >> *top_bn_var_out; + *top_bn >> *top_bn_saved_mean; + *top_bn >> *top_bn_saved_var; + } + + void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) override { + cpp::OpDesc op_desc; + op_desc.SetType("__xpu__resnet50"); + op_desc.SetInput("Input", {matched.at("input")->arg()->name}); + std::vector filter_name = { + matched.at("top_conv_weight")->arg()->name}; + std::vector scale_name = { + matched.at("top_bn_scale")->arg()->name}; + std::vector bias_name = { + matched.at("top_bn_bias")->arg()->name}; + std::vector mean_name = { + matched.at("top_bn_mean")->arg()->name}; + std::vector var_name = { + matched.at("top_bn_variance")->arg()->name}; + std::vector max_filter_name; + std::vector resnet_block_vec = { + "resnet_block0_1", + "resnet_block1_1_1", + "resnet_block1_1_2", + "resnet_block0_2", + "resnet_block1_2_1", + "resnet_block1_2_2", + "resnet_block1_2_3", + "resnet_block0_3", + "resnet_block1_3_1", + "resnet_block1_3_2", + "resnet_block1_3_3", + "resnet_block1_3_4", + "resnet_block1_3_5", + "resnet_block0_4", + "resnet_block1_4_1", + "resnet_block1_4_2", + }; + for (auto& block : resnet_block_vec) { + auto* block_op_info = matched.at(block)->stmt()->op_info(); + auto block_filter_name = block_op_info->Input("Filter"); + std::copy(block_filter_name.begin(), + block_filter_name.end(), + std::back_inserter(filter_name)); + auto block_scale_name = block_op_info->Input("Scale"); + std::copy(block_scale_name.begin(), + block_scale_name.end(), + std::back_inserter(scale_name)); + auto block_bias_name = block_op_info->Input("Bias"); + std::copy(block_bias_name.begin(), + block_bias_name.end(), + std::back_inserter(bias_name)); + auto block_mean_name = block_op_info->Input("Mean"); + std::copy(block_mean_name.begin(), + block_mean_name.end(), + std::back_inserter(mean_name)); + auto block_var_name = block_op_info->Input("Var"); + std::copy(block_var_name.begin(), + block_var_name.end(), + std::back_inserter(var_name)); + } + op_desc.SetInput("Filter", filter_name); + op_desc.SetInput("Bias", bias_name); + op_desc.SetOutput("Output", {matched.at("bottom_pool_out")->arg()->name}); + op_desc.SetAttr("xpu", 1); + + auto* resnet50_stmt = matched.at("top_conv")->stmt(); + auto* scope = resnet50_stmt->op()->scope(); + for (size_t i = 0; i < filter_name.size(); ++i) { + auto* filter_t = scope->FindMutableTensor(filter_name[i]); + auto* scale_t = scope->FindMutableTensor(scale_name[i]); + auto* bias_t = scope->FindMutableTensor(bias_name[i]); + auto* mean_t = scope->FindMutableTensor(mean_name[i]); + auto* var_t = scope->FindMutableTensor(var_name[i]); + + int mean_len = mean_t->numel(); + int filter_len = filter_t->numel(); + int filter_stride = filter_len / mean_len; + + float* filter_on_host = 
filter_t->mutable_data(); + float* scale_on_host = scale_t->mutable_data(); + float* bias_on_host = bias_t->mutable_data(); + float* mean_on_host = mean_t->mutable_data(); + float* var_on_host = var_t->mutable_data(); + + // Perform preprocess + for (int i = 0; i < mean_len; ++i) { + scale_on_host[i] = scale_on_host[i] / sqrtf(var_on_host[i] + 0.00001f); + } + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; ++j) { + filter_on_host[i * filter_stride + j] *= scale_on_host[i]; + } + } + for (int i = 0; i < mean_len; ++i) { + bias_on_host[i] += -mean_on_host[i] * scale_on_host[i]; + } + + float max_f = + paddle::lite::xpu::math::FindMaxAbs(filter_on_host, filter_len); + std::unique_ptr filter_int16(new int16_t[filter_len]); + paddle::lite::xpu::math::ConvertFP32ToInt16( + filter_on_host, filter_int16.get(), max_f, filter_len); + memcpy(filter_on_host, filter_int16.get(), filter_len * sizeof(int16_t)); + + // create new arg in graph and scope + std::string max_name = filter_name[i] + "_max"; + max_filter_name.push_back(max_name); + auto* max_filter_node = graph->NewArgumentNode(max_name); + max_filter_node->arg()->is_weight = true; + max_filter_node->arg()->type = LiteType::GetTensorTy( + TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + DirectedLink(max_filter_node, matched.at("top_conv")); + auto* max_filter_t = scope->NewTensor(max_name); + max_filter_t->Resize({4}); + float* max_ptr = max_filter_t->mutable_data(); + max_ptr[0] = max_f; + max_ptr[1] = max_f; + max_ptr[2] = max_f; + max_ptr[3] = max_f; + } + op_desc.SetInput("MaxFilter", max_filter_name); + + auto resnet50_op = LiteOpRegistry::Global().Create(op_desc.Type()); + resnet50_op->Attach(op_desc, scope); + resnet50_op->SetValidPlaces(resnet50_stmt->op()->valid_places()); + auto kernels = resnet50_op->CreateKernels(resnet50_op->valid_places()); + resnet50_stmt->SetOp(resnet50_op); + resnet50_stmt->SetKernels(std::move(kernels)); + + IR_NODE_LINK_TO(matched.at("top_bn_bias"), matched.at("top_conv")); + for (auto* node : extra_input_vars) { + IR_NODE_LINK_TO(node, matched.at("top_conv")); + } + IR_OP_VAR_LINK(matched.at("top_conv"), matched.at("bottom_pool_out")); + } +}; + +} // namespace fusion + +class XPUResNet50FusePass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override { + if (GetBoolFromEnv("XPU_ENABLE_XTCL")) return; + fusion::XPUResNetBlock0Fuser block0_fuser; + block0_fuser(graph.get()); + fusion::XPUResNetBlock1Fuser block1_fuser; + block1_fuser(graph.get()); + fusion::XPUResNet50Fuser resnet50_fuser; + resnet50_fuser(graph.get()); + } +}; + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(__xpu__resnet_fuse_pass, + paddle::lite::mir::XPUResNet50FusePass) + .BindTargets({TARGET(kXPU)}) + .BindKernel("conv2d"); diff --git a/lite/core/mir/fusion/conv_bn_fuse_pass.cc b/lite/core/mir/fusion/conv_bn_fuse_pass.cc index f5a7837b53650e08f9632b499a4c2ab1faeaeedf..4393832931c95ca20e34ca3b3d2fb4501274b15f 100644 --- a/lite/core/mir/fusion/conv_bn_fuse_pass.cc +++ b/lite/core/mir/fusion/conv_bn_fuse_pass.cc @@ -26,7 +26,8 @@ namespace mir { void ConvBNFusePass::Apply(const std::unique_ptr& graph) { // initialze fuser params std::vector conv_has_bias_cases{true, false}; - std::vector conv_type_cases{"conv2d", "depthwise_conv2d"}; + std::vector conv_type_cases{ + "conv2d", "depthwise_conv2d", "conv2d_transpose"}; // start fuse using params for (auto conv_has_bias : conv_has_bias_cases) { for (auto conv_type : conv_type_cases) { diff --git 
a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 0f5bb64e10dd61c3edf4ddd32569a2d365651cdf..43869beddd0af701d5f78ea047b30f6b136e6b3f 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -103,14 +103,20 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { std::string conv_weight_name = matched.at("conv_weight")->arg()->name; auto conv_weight_t = scope->FindVar(conv_weight_name)->GetMutable(); - CHECK_EQ(static_cast(bn_scale_t->data_size()), - static_cast(conv_weight_t->dims()[0])) - << "The BN bias's size should be equal to the size of the first " - << "dim size of the conv weights"; + if (conv_type_ == "conv2d_transpose") { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[1])) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } else { + CHECK_EQ(static_cast(bn_scale_t->data_size()), + static_cast(conv_weight_t->dims()[0])) + << "The BN bias's size should be equal to the size of the first " + << "dim size of the conv weights"; + } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -153,12 +159,29 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { // compute new conv_weight for int8 auto weight_scale = conv_op_desc->GetAttr>("weight_scale"); - for (unsigned int i = 0; i < h; ++i) { - weight_scale[i] *= fabsf(alpha_data[i]); - if (alpha_data[i] < 0.f) { - auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { - ptr_row[j] *= -1; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= -1; + } + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { + weight_scale[i] *= fabsf(alpha_data[i]); + if (alpha_data[i] < 0.f) { + auto ptr_row = conv_weight_d + i * w; + for (unsigned int j = 0; j < w; ++j) { + ptr_row[j] *= -1; + } } } } @@ -176,9 +199,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } else { // compute new conv_weight auto conv_weight_d = conv_weight_t->mutable_data(); - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels - conv_weight_d[i * w + j] *= alpha_data[i]; + if (conv_type_ == "conv2d_transpose") { + int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * + conv_weight_t->dims()[3]; + int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; + for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (unsigned int i = 0; i < h; ++i) { + auto ptr_row = conv_weight_d + k * c_size + i * hw; + for (unsigned int j = 0; j < hw; ++j) { + ptr_row[j] *= alpha_data[i]; + } + } + } + } else { + for (unsigned int i = 0; i < h; ++i) { // n: conv2d output 
channels + for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels + conv_weight_d[i * w + j] *= alpha_data[i]; + } } } } diff --git a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc index ab81f3d809507dd340056c97a39998c908a75dc7..80a033c75f2e23efa091375ee2a9f78e3ff40d71 100644 --- a/lite/core/mir/fusion/quant_dequant_fuse_pass.cc +++ b/lite/core/mir/fusion/quant_dequant_fuse_pass.cc @@ -44,11 +44,9 @@ void QuantDequantFusePass::Apply(const std::unique_ptr& graph) { fuser(graph.get()); } - // delete quant_dequant_node - for (auto op_type : {"pool2d", "softmax", "elementwise_add"}) { - fusion::DeleteQuantDequantOpFuser fuser(op_type); - fuser(graph.get()); - } + // process quant_dequant_node + fusion::DeleteQuantDequantOpFuser dqd_fuser; + dqd_fuser(graph.get()); } } // namespace mir diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index 7797864a2e4b75f52fd7da93ea81613a2175f423..a3a98b871fb4b6f8230299cda978b0f1f8faa779 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -50,7 +50,7 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, auto* output_scale_node = matched.at("output_scale_node"); auto* output_act_node = matched.at("output_act_node"); - // obtain values, save values and relink node + // obtain scale, save attrs and relink node int bit_length = quant_node->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); auto* scope = quant_node->stmt()->op()->scope(); @@ -58,11 +58,22 @@ void DeleteQuantOpFuser::InsertNewNode(SSAGraph* graph, ->GetMutable(); float scale_value = scale_tensor->data()[0] / range; + auto in_act_name = input_act_node->arg()->name; + auto out_act_name = output_act_node->arg()->name; auto outlinks = output_act_node->outlinks; for (auto* quantized_node : outlinks) { - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); + // save input scale in quantized op by input argname + index + auto op_desc = *quantized_node->stmt()->mutable_op_info(); + std::string argname; + int index; + op_desc.GetInputArgname(out_act_name, &argname); + op_desc.GetInputIndex(out_act_name, &index); + op_desc.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_desc.SetAttr("input_scale", scale_value); // save it for now + op_desc.SetAttr("bit_length", bit_length); + op_desc.UpdateAllInputs(out_act_name, in_act_name); + quantized_node->stmt()->ResetOp(op_desc, graph->valid_places()); IR_NODE_LINK_TO(input_act_node, quantized_node) } @@ -125,19 +136,18 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain weight_scale from max_range auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); int bit_length = quantized_op->stmt()->op_info()->GetAttr("bit_length"); int range = ((1 << (bit_length - 1)) - 1); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); float max_range = dequant_op->stmt()->op_info()->GetAttr("max_range"); float whole_weight_scale = static_cast(range * range) / max_range / range; - // max_range = range * range / max(abs(weight)) - // weight_scale = range * range / (range * range / max(abs(weight))) / 
range - // = max(abs(weight)) / range + // As: max_range = range * range / max(abs(weight)) + // So: whole_weight_scale + // = range * range / (range * range / max(abs(weight))) / range + // = max(abs(weight)) / range // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); @@ -153,7 +163,7 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, // Conv weight shape: Cout * Cin * kh * hw, the weight_scale_size should // be Cout. weight_scale_size = quantized_weight_t->dims()[0]; - } else if (quantized_op_type_ == "mul") { + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { op_desc.SetInput("X", {quantized_op_input->arg()->name}); op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); // Fc weight: Cin * Cout, the weight_scale_size should be Cout. @@ -163,7 +173,6 @@ void DequantOpFuser::InsertNewNode(SSAGraph* graph, weight_scale.push_back(whole_weight_scale); } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. @@ -209,6 +218,7 @@ void ChannelWiseDequantOpFuser::BuildPattern() { ->assert_is_op_output(quantized_op_type_) ->assert_is_op_input(dequant_op_type, "X") ->AsIntermediate(); + // The scale var_node of input activation is deleted in DeleteQuantOpFuser auto* dequant_op_channel_scale = VarNode("dequant_op_channel_scale") ->assert_is_op_input(dequant_op_type) ->AsIntermediate(); @@ -237,11 +247,9 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto* dequant_op = matched.at("dequant_op"); auto* dequant_op_out = matched.at("dequant_op_out"); - // obtain input_scale and weight_scale + // obtain input weight_scale from fake_dequant op auto* scope = quantized_op->stmt()->op()->scope(); auto& valid_places = quantized_op->stmt()->op()->valid_places(); - float input_scale = - quantized_op->stmt()->op_info()->GetAttr("input_scale"); std::vector weight_scale; std::vector quant_bits = @@ -258,11 +266,15 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, // set op desc cpp::OpDesc op_desc = *quantized_op->stmt()->op_info(); - op_desc.SetInput("Input", {quantized_op_input->arg()->name}); - op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); - + if (quantized_op_type_ == "conv2d" || + quantized_op_type_ == "depthwise_conv2d") { + op_desc.SetInput("Input", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Output", {dequant_op_out->arg()->name}); + } else if (quantized_op_type_ == "mul" || quantized_op_type_ == "matmul") { + op_desc.SetInput("X", {quantized_op_input->arg()->name}); + op_desc.SetOutput("Out", {dequant_op_out->arg()->name}); + } op_desc.SetAttr("enable_int8", true); - op_desc.SetAttr("input_scale", input_scale); op_desc.SetAttr("weight_scale", weight_scale); // change the weight from the float type to int8 type. 
@@ -297,167 +309,65 @@ cpp::OpDesc ChannelWiseDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { void DeleteQuantDequantOpFuser::BuildPattern() { std::string quant_dequant_op_type = "fake_quantize_dequantize_moving_average_abs_max"; - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = - VarNode("input_scale_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_node = VarNode("input_act_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_node = - OpNode("quant_dequant_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_node = - VarNode("output_scale_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_node = - VarNode("output_act_node") - ->assert_is_op_output(quant_dequant_op_type, "Out"); - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - - quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); - output_scale_node->LinksFrom({quant_dequant_node}); - output_act_node->LinksFrom({quant_dequant_node}); - quantized_node->LinksFrom({output_act_node}); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = - VarNode("input_scale_left_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_left_node = - VarNode("input_act_left_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_left_node = - OpNode("quant_dequant_left_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_left_node = - VarNode("output_scale_left_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_left_node = - VarNode("output_act_left_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "X"); - quant_dequant_left_node->LinksFrom( - {input_scale_left_node, input_act_left_node}); - output_scale_left_node->LinksFrom({quant_dequant_left_node}); - output_act_left_node->LinksFrom({quant_dequant_left_node}); - - auto* input_scale_right_node = - VarNode("input_scale_right_node") - ->assert_is_op_input(quant_dequant_op_type, "InScale"); - auto* input_act_right_node = - VarNode("input_act_right_node") - ->assert_is_op_input(quant_dequant_op_type, "X"); - auto* quant_dequant_right_node = - OpNode("quant_dequant_right_node", quant_dequant_op_type) - ->assert_is_op(quant_dequant_op_type); - auto* output_scale_right_node = - VarNode("output_scale_right_node") - ->assert_is_op_output(quant_dequant_op_type, "OutScale"); - auto* output_act_right_node = - VarNode("output_act_right_node") - ->assert_is_op_output(quant_dequant_op_type, "Out") - ->assert_is_op_input(quantized_op_type_, "Y"); - quant_dequant_right_node->LinksFrom( - {input_scale_right_node, input_act_right_node}); - output_scale_right_node->LinksFrom({quant_dequant_right_node}); - output_act_right_node->LinksFrom({quant_dequant_right_node}); - - auto* quantized_node = OpNode("quantized_node", quantized_op_type_) - ->assert_is_op(quantized_op_type_); - quantized_node->LinksFrom({output_act_left_node, output_act_right_node}); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; - } - VLOG(4) << "DeleteQuantDequantOpFuser BuildPattern op_type:" - << quantized_op_type_; + auto* input_scale_node = + VarNode("input_scale_node") + ->assert_is_op_input(quant_dequant_op_type, "InScale"); + auto* 
input_act_node = + VarNode("input_act_node")->assert_is_op_input(quant_dequant_op_type, "X"); + auto* quant_dequant_node = OpNode("quant_dequant_node", quant_dequant_op_type) + ->assert_is_op(quant_dequant_op_type); + auto* output_scale_node = + VarNode("output_scale_node") + ->assert_is_op_output(quant_dequant_op_type, "OutScale"); + auto* output_act_node = + VarNode("output_act_node") + ->assert_is_op_output(quant_dequant_op_type, "Out"); + + quant_dequant_node->LinksFrom({input_scale_node, input_act_node}); + output_scale_node->LinksFrom({quant_dequant_node}); + output_act_node->LinksFrom({quant_dequant_node}); } void DeleteQuantDequantOpFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { - if (quantized_op_type_ == "pool2d" || quantized_op_type_ == "softmax") { - auto* input_scale_node = matched.at("input_scale_node"); - auto* input_act_node = matched.at("input_act_node"); - auto* quant_dequant_node = matched.at("quant_dequant_node"); - auto* output_scale_node = matched.at("output_scale_node"); - auto* output_act_node = matched.at("output_act_node"); - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_node->stmt()->op()->scope(); - auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) - ->GetMutable(); - float scale_value = scale_tensor->data()[0] / range; - - auto* op_desc = quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("input_scale", scale_value); - op_desc->SetInput("X", {input_act_node->arg()->name}); - IR_NODE_LINK_TO(input_act_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_node, - quant_dequant_node, - output_scale_node, - output_act_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else if (quantized_op_type_ == "elementwise_add") { - auto* input_scale_left_node = matched.at("input_scale_left_node"); - auto* input_act_left_node = matched.at("input_act_left_node"); - auto* quant_dequant_left_node = matched.at("quant_dequant_left_node"); - auto* output_scale_left_node = matched.at("output_scale_left_node"); - auto* output_act_left_node = matched.at("output_act_left_node"); - - auto* input_scale_right_node = matched.at("input_scale_right_node"); - auto* input_act_right_node = matched.at("input_act_right_node"); - auto* quant_dequant_right_node = matched.at("quant_dequant_right_node"); - auto* output_scale_right_node = matched.at("output_scale_right_node"); - auto* output_act_right_node = matched.at("output_act_right_node"); - - auto* quantized_node = matched.at("quantized_node"); - - // obtain values, save values and relink node - int bit_length = - quant_dequant_left_node->stmt()->op_info()->GetAttr("bit_length"); - int range = ((1 << (bit_length - 1)) - 1); - auto* scope = quant_dequant_left_node->stmt()->op()->scope(); - auto* left_scale_tensor = - scope->FindVar(output_scale_left_node->arg()->name) - ->GetMutable(); - float left_scale_value = left_scale_tensor->data()[0] / range; - auto* right_scale_tensor = - scope->FindVar(output_scale_right_node->arg()->name) - ->GetMutable(); - float right_scale_value = right_scale_tensor->data()[0] / range; - - auto* op_desc = 
quantized_node->stmt()->mutable_op_info(); - op_desc->SetAttr("bit_length", bit_length); - op_desc->SetAttr("x_input_scale", left_scale_value); - op_desc->SetAttr("y_input_scale", right_scale_value); - op_desc->SetInput("X", {input_act_left_node->arg()->name}); - op_desc->SetInput("Y", {input_act_right_node->arg()->name}); - IR_NODE_LINK_TO(input_act_left_node, quantized_node) - IR_NODE_LINK_TO(input_act_right_node, quantized_node) - auto update_op_desc = *quantized_node->stmt()->mutable_op_info(); - quantized_node->stmt()->ResetOp(update_op_desc, graph->valid_places()); - - // delete nodes and edges - std::unordered_set nodes2rm = {input_scale_left_node, - quant_dequant_left_node, - output_scale_left_node, - output_act_left_node, - input_scale_right_node, - quant_dequant_right_node, - output_scale_right_node, - output_act_right_node}; - GraphSafeRemoveNodes(graph, nodes2rm); - } else { - LOG(FATAL) << "No support quantized_op_type:" << quantized_op_type_; + auto* input_scale_node = matched.at("input_scale_node"); + auto* input_act_node = matched.at("input_act_node"); + auto* quant_dequant_node = matched.at("quant_dequant_node"); + auto* output_scale_node = matched.at("output_scale_node"); + auto* output_act_node = matched.at("output_act_node"); + auto input_act_name = input_act_node->arg()->name; + auto output_act_name = output_act_node->arg()->name; + + // Get scale value from scale var node + int bit_length = + quant_dequant_node->stmt()->op_info()->GetAttr("bit_length"); + int range = ((1 << (bit_length - 1)) - 1); + auto* scope = quant_dequant_node->stmt()->op()->scope(); + auto* scale_tensor = scope->FindVar(output_scale_node->arg()->name) + ->GetMutable(); + float scale_value = scale_tensor->data()[0] / range; + + auto quantized_nodes = output_act_node->outlinks; + for (auto* quantized_node : quantized_nodes) { + // Save quantization info in op_info attr + auto op_info = *quantized_node->stmt()->op_info(); + std::string argname; + int index; + op_info.GetInputArgname(output_act_name, &argname); + op_info.GetInputIndex(output_act_name, &index); + op_info.SetAttr(argname + std::to_string(index) + "_input_scale", + scale_value); + op_info.SetAttr("input_scale", scale_value); // Save it for now + op_info.SetAttr("bit_length", bit_length); + + op_info.UpdateAllInputs(output_act_name, input_act_name); + quantized_node->stmt()->ResetOp(op_info, graph->valid_places()); + IR_NODE_LINK_TO(input_act_node, quantized_node); } + // delete nodes and edges + std::unordered_set nodes2rm = { + input_scale_node, quant_dequant_node, output_scale_node, output_act_node}; + GraphSafeRemoveNodes(graph, nodes2rm); } cpp::OpDesc DeleteQuantDequantOpFuser::GenOpDesc(const key2nodes_t& matched) { diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.h b/lite/core/mir/fusion/quant_dequant_op_fuser.h index bef9f4d9573d049700736c166cd0d31b668f7eff..ac3ac112b3aa504bc075125f2f13292073ca9444 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.h +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.h @@ -87,24 +87,16 @@ class ChannelWiseDequantOpFuser : public FuseBase { }; /* The pattern like "fake_quantize_dequantize_moving_average_abs_max + - * pooled/elementwise_add" can be deteted by this fuser. The fuser - * extract the input_scale form fake_quant_dequant_op and save into - * the quantized_op. Besides, the fuser delete fake_quant_dequant_op in - * the graph. + * quantized_op" can be deteted by this fuser. The fuser modifies the input + * scale for the quantized_op and deletes the fake_quant_dequant_op. 
*/ - class DeleteQuantDequantOpFuser : public FuseBase { public: - explicit DeleteQuantDequantOpFuser(const std::string& quantized_op_type) - : quantized_op_type_(quantized_op_type) {} void BuildPattern() override; void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override; private: cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override; - - private: - std::string quantized_op_type_{}; }; } // namespace fusion diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 76c97d2da6ed9e7c6fc1f1889d80095278b68ec0..d7486c0933dbbe74115bd6358962817b2b946c12 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -14,6 +14,7 @@ #include "lite/core/mir/generate_program_pass.h" #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -25,10 +26,37 @@ namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "final program \n" << Visualize(graph.get()); - for (auto& item : graph->StmtTopologicalOrder()) { + std::vector nodes_in_order; +#ifdef LITE_WITH_CUDA + const std::string depend_pass = "multi_stream_analysis_pass"; + const std::string attr_name = "nodes_in_order"; + mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass); + if (pass->HasAttr(attr_name)) { + nodes_in_order = pass->GetAttr>(attr_name); + } +#endif + if (nodes_in_order.empty()) { + nodes_in_order = graph->StmtTopologicalOrder(); + } + + for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; +#ifdef LITE_WITH_CUDA + if (stmt.kernels().front()->target() == TargetType::kCUDA) { + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetNeedSync(stmt.need_sync_); + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetSyncStreams(stmt.sync_streams_); + } +#endif insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index 28ec814fa85451b5292bfde6bddc6b64b57b2f08..55b7a004567ec5a5298e084839d6dcf5a8591882 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -26,15 +26,13 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - void GraphVisualizePass::Apply(const std::unique_ptr& graph) { VLOG(5) << "\n" << Visualize(graph.get()); } std::string Visualize(mir::SSAGraph* graph) { std::ostringstream os; - inference::analysis::Dot dot; + Dot dot; auto string_trunc = [](const std::string& str) -> std::string { const int max_disp_size = 100; if (str.length() > max_disp_size) @@ -87,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { if (!node->IsStmt()) continue; auto op_info = node->AsStmt().op_info(); auto op_type = op_info->Type(); - std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + std::string op_name; + if (node->AsStmt().need_sync_) { + std::ostringstream oss; + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + oss << std::to_string(node->AsStmt().sync_streams_[i]); + if (i != node->AsStmt().sync_streams_.size() - 1) { + oss << ","; + } + } + op_name = string_format("%s%d, stream=%d, sync_streams={%s}", + op_type.c_str(), + op_idx++, + node->AsStmt().stream_id_, + oss.str().c_str()); + } else { + op_name = string_format("%s%d", op_type.c_str(), op_idx++); + } // Add its input&output variables as the Dot nodes dot.AddNode(op_name, {Dot::Attr("shape", "box"), @@ -95,7 +109,13 
@@ std::string Visualize(mir::SSAGraph* graph) { Dot::Attr("color", "black"), Dot::Attr("fillcolor", "yellow")}); for (auto& x : node->inlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); @@ -103,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) { dot.AddEdge(var_name, op_name, {}); } for (auto& x : node->outlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 38293ede76ed35bf05767ce1333947b7dfdbc4ac..6c7a7c5803268f0729be3a1d2164c0598c8738bd 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -313,4 +313,4 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) - .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index 515eab9d3f20ebf85c2c5abad6d84f109ec68068..03d3c5056031c0604e706157fd509508dcd5ea8d 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -562,20 +562,22 @@ void MLUPostprocessPass::ModifyLayout(SSAGraph* graph) { } void MLUPostprocessPass::Apply(const std::unique_ptr& graph) { - // currently for non-persistent input and output args, mlu subgraph op - // only support float16/float32 data type - - // in two situations as folllows: - // 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; - // arg_in and arg_out are assumed to be NHWC which user should be aware of. - // Thus here we change these args' layout to NHWC - if (lite::TargetWrapperMlu::InputLayout() == DATALAYOUT(kNHWC)) { +// currently for non-persistent input and output args, mlu subgraph op +// only support float16/float32 data type + +// in two situations as folllows: +// 1: feed->arg_in->subgraph->... 2: ...->subgraph->arg_out->fetch; +// arg_in and arg_out are assumed to be NHWC which user should be aware of. +// Thus here we change these args' layout to NHWC +#ifdef LITE_WITH_MLU + if (lite::DeviceInfo::Global().InputLayout() == DATALAYOUT(kNHWC)) { ModifyLayout(graph.get()); } - if (lite::TargetWrapperMlu::UseFirstConv()) { + if (lite::DeviceInfo::Global().UseFirstConv()) { GatherAndModifyFirstConvNodes(graph.get()); } +#endif // insert io_copy, layout and precision cast of subgraph's inputs and outputs for (auto& node : graph->mutable_nodes()) { diff --git a/lite/core/mir/multi_stream_analysis_pass.cc b/lite/core/mir/multi_stream_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..46454a1fc357c7d96162a58a43a6c34bc890bc69 --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/multi_stream_analysis_pass.h" + +#include +#include +#include +#include + +#include "lite/core/device_info.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace mir { + +void MultiStreamAnalysisPass::CleanUp() { + exec_ops_.clear(); + wait_que_.clear(); + wait_que_cpu_.clear(); + std::queue empty_queue; + while (!exec_que_.empty()) { + exec_que_.pop(); + } + ops_in_streams_.clear(); + resources_.clear(); + map_arg_to_lane_.clear(); + op_types_set_.clear(); + io_copy_once_num_ = 0; +} + +void MultiStreamAnalysisPass::Init(SSAGraph* graph) { + // If not cleaned, the clone will overlay the previous state + CleanUp(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->IsStmt()) { + // Set all outputs of op to inaccessible state. + auto outputs = op_node->outlinks; + for (Node* node : outputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (!resources_.count(arg.name)) { + resources_[arg.name] = false; + } + } + // Set the weight input of op to be accessible. + auto inputs = op_node->inlinks; + for (Node* node : inputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (arg.is_weight || arg.is_persist) { + resources_[arg.name] = true; + } + } + + // feed and io_copy_once op has no dependencies and can be launched + // directly. Other ops are put into the waiting queue. + if (op_node->AsStmt().op_type() == "feed" || + op_node->AsStmt().op_type() == "io_copy_once") { + exec_que_.push(op_node); + } else { + auto tgt = op_node->AsStmt().kernels().front()->target(); + if (tgt == TargetType::kCUDA) { + wait_que_.push_back(op_node); + } else { + wait_que_cpu_.push_back(op_node); + } + } + op_types_set_.insert(op_node->AsStmt().op_type()); + } + } + + // Set the stream id according to the number of feed ops, and set the output + // of the feed op to be accessible. + int lane = 0; + auto nodes = graph->inputs(); + ops_in_streams_.resize(max_stream_); + + for (auto& node : nodes) { + std::string::size_type idx = node->AsArg().name.find("feed"); + if (idx != std::string::npos) { + for (auto& feed_ops : node->outlinks) { + if (feed_ops->AsStmt().op_type() == "feed") { + // feed op doesn't need to wait sync. 
+ feed_ops->AsStmt().need_sync_ = false; + CHECK_EQ(static_cast(feed_ops->outlinks.size()), 1) + << "feed op must have one output."; + for (auto& var : feed_ops->outlinks) { + var->AsArg().lane = lane; + map_arg_to_lane_[var->AsArg().name] = lane; + resources_[var->AsArg().name] = true; + } + feed_ops->AsStmt().stream_id_ = lane; + ops_in_streams_[lane].push_back(feed_ops); + ++lane; + if (lane >= max_stream_) { + lane = 0; + } + } + } + } + // set all io_copy_once op in the first stream + for (auto& io_copy_once_ops : node->outlinks) { + if (io_copy_once_ops->AsStmt().op_type() == "io_copy_once") { + ops_in_streams_[0].push_back(io_copy_once_ops); + io_copy_once_ops->AsStmt().stream_id_ = 0; + io_copy_once_ops->AsStmt().need_sync_ = false; + ++io_copy_once_num_; + } + } + } +} + +bool MultiStreamAnalysisPass::CheckOpSupport() { + std::unordered_set invalid_op = { + "while", "conditional_block", "conditional_block_infer", "graph_op"}; + for (auto& op_type : op_types_set_) { + if (invalid_op.count(op_type)) { + LOG(INFO) << "multi_stream_analysis_pass don't support " << op_type + << ", just return."; + return false; + } + } + return true; +} + +bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) { + // feed op are prepared when init. + std::string op_name = stmt_node->AsStmt().op_type(); + if (op_name == "feed") { + return true; + } + + // Check is op's input are all accessible. + std::vector args; + for (auto* ins : stmt_node->inlinks) { + args.push_back(ins->AsArg().name); + } + return CheckAccess(args); +} + +bool MultiStreamAnalysisPass::CheckAccess( + const std::vector& args) { + if (args.size() == 0) { + return true; + } + for (auto& name : args) { + if (resources_[name]) { + continue; + } else { + return false; + } + } + return true; +} + +int MultiStreamAnalysisPass::SelectStreamId(const std::vector& lanes) { + if (lanes.size() == 0) { + return 0; + } + + int res = lanes[0]; + int exclude_io_copy_once_num = ops_in_streams_[0].size() - io_copy_once_num_; + int min_num = lanes[0] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[0]].size(); + for (size_t i = 1; i < lanes.size(); ++i) { + int ith_num = lanes[i] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[i]].size(); + if (ith_num < min_num) { + res = lanes[i]; + min_num = ith_num; + } + } + + return res; +} + +void MultiStreamAnalysisPass::Launch(Node* stmt_node) { + // record ops launch order. + exec_que_.push(stmt_node); + std::vector lanes; + for (auto& in_arg : stmt_node->inlinks) { + // Weight parameter does not involve stream id, so just skip it. + if (in_arg->AsArg().is_weight || in_arg->AsArg().is_persist) { + continue; + } + + if (std::find(lanes.begin(), lanes.end(), in_arg->AsArg().lane) == + lanes.end()) { + lanes.push_back(in_arg->AsArg().lane); + } + } + + int stream_id = SelectStreamId(lanes); + + // If all inputs of the op are on multiple streams, they need to be + // synchronized + if (lanes.size() > 1) { + for (size_t i = 0; i < lanes.size(); ++i) { + if (lanes[i] != stream_id) { + stmt_node->AsStmt().sync_streams_.push_back(lanes[i]); + } + } + stmt_node->AsStmt().need_sync_ = true; + } + // io_copy are nodes inserted across devices and need to be synced. + if (stmt_node->AsStmt().op_type() == "io_copy") { + stmt_node->AsStmt().need_sync_ = true; + } + stmt_node->AsStmt().stream_id_ = stream_id; + + // set output lane and set the output of op to be accessible. 
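SelectStreamId above balances work across streams: among the lanes carrying the op's inputs it picks the one currently holding the fewest ops, with stream 0 counted without its io_copy_once ops. A small sketch of just that rule (standalone, illustrative names):

#include <iostream>
#include <vector>

// Among the candidate lanes (the lanes of the op's inputs), pick the one with
// the fewest ops; lane 0's load excludes io_copy_once ops.
int SelectLeastLoaded(const std::vector<int>& lanes,
                      const std::vector<int>& ops_per_stream,
                      int io_copy_once_num) {
  if (lanes.empty()) return 0;
  auto load = [&](int lane) {
    return lane == 0 ? ops_per_stream[0] - io_copy_once_num
                     : ops_per_stream[lane];
  };
  int best = lanes[0];
  for (size_t i = 1; i < lanes.size(); ++i) {
    if (load(lanes[i]) < load(best)) best = lanes[i];
  }
  return best;
}

int main() {
  // Stream 0 holds 7 ops but 3 are io_copy_once, so its effective load is 4.
  std::vector<int> ops_per_stream = {7, 4, 6};
  std::cout << SelectLeastLoaded({0, 2}, ops_per_stream, 3) << std::endl;  // 0
}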
+ for (auto& out_arg : stmt_node->outlinks) { + out_arg->AsArg().lane = stream_id; + resources_[out_arg->AsArg().name] = true; + } + ops_in_streams_[stream_id].push_back(stmt_node); +} + +void MultiStreamAnalysisPass::Apply(const std::unique_ptr& graph) { +#ifdef LITE_WITH_CUDA + typename Env::Devs& devs = + Env::Global(); + int dev_id = TargetWrapper::GetCurDevice(); + max_stream_ = devs[dev_id].max_stream(); +#else + LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON"; +#endif + + // Find the correct startup sequence for op. + Init(graph.get()); + bool is_valid = CheckOpSupport(); + if (!is_valid) { + return; + } + size_t prev_size; + + while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) { + prev_size = this->wait_que_.size() + this->wait_que_cpu_.size(); + // launch the acessible cuda kernel and remove it from wait que. + for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) { + if (IsPrepared(*it)) { + Launch(*it); + it = wait_que_.erase(it); + } else { + ++it; + } + } + // launch the accessible cpu kernel and remove it from wait que. + for (auto cpu_it = this->wait_que_cpu_.begin(); + cpu_it != this->wait_que_cpu_.end();) { + if (IsPrepared(*cpu_it)) { + Launch(*cpu_it); + cpu_it = wait_que_cpu_.erase(cpu_it); + } else { + ++cpu_it; + } + } + + if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) { + LOG(FATAL) << "network topo error!"; + } + } + + // Get exec ops order. + while (!exec_que_.empty()) { + auto* node = exec_que_.front(); + exec_ops_.push_back(node); + VLOG(4) << node->AsStmt().op_type() + << " stream: " << node->AsStmt().stream_id_ + << ", sync: " << node->AsStmt().need_sync_; + if (node->AsStmt().need_sync_) { + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + VLOG(4) << " " << node->AsStmt().sync_streams_[i]; + } + } + exec_que_.pop(); + } + + // Set attribute parameters, for passing parameters between passes + const std::string attr_name{"nodes_in_order"}; + SetAttr>(attr_name, &exec_ops_); + + LOG(INFO) << "stream " << 0 << " has " + << ops_in_streams_[0].size() - io_copy_once_num_ + << " ops. (exclude io_copy_once)."; + for (size_t i = 1; i < ops_in_streams_.size(); ++i) { + LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size() + << " ops."; + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(multi_stream_analysis_pass, + paddle::lite::mir::MultiStreamAnalysisPass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/multi_stream_analysis_pass.h b/lite/core/mir/multi_stream_analysis_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..37a7feca3a1200ad7ff26ef8fc0317deee9d174e --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
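The Apply loop above repeatedly sweeps both wait queues, launching every op whose inputs have become accessible, and treats a sweep that makes no progress as a broken topology. Roughly, the control flow is the following standalone sketch (toy op ids, not the pass itself):

#include <functional>
#include <iostream>
#include <list>
#include <stdexcept>

// Sweep the wait queue, launching every prepared op; a sweep that launches
// nothing means some dependency can never be satisfied.
void Schedule(std::list<int> wait_queue,
              const std::function<bool(int)>& is_prepared,
              const std::function<void(int)>& launch) {
  while (!wait_queue.empty()) {
    size_t prev_size = wait_queue.size();
    for (auto it = wait_queue.begin(); it != wait_queue.end();) {
      if (is_prepared(*it)) {
        launch(*it);
        it = wait_queue.erase(it);
      } else {
        ++it;
      }
    }
    if (wait_queue.size() == prev_size) {
      throw std::runtime_error("network topo error!");
    }
  }
}

int main() {
  bool op0_done = false;  // op 1 depends on op 0
  Schedule({1, 0},
           [&](int op) { return op == 0 || op0_done; },
           [&](int op) {
             if (op == 0) op0_done = true;
             std::cout << "launch op " << op << std::endl;
           });
}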
+ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "lite/core/kernel.h" +#include "lite/core/mir/pass.h" + +namespace paddle { +namespace lite { +namespace mir { + +/* + * MultiStreamAnalysisPass will find the correct launch sequence for all ops. + * Ideally, the order should be multiple asynchronous ops and a small number of + * synchronous ops. + */ +class MultiStreamAnalysisPass : public StmtPass { + public: + void Apply(const std::unique_ptr& graph) override; + + private: + // Init resource list. Set all ops except feed to inaccessible state and set + // stream id according to the numer of inputs. + void Init(SSAGraph* graph); + + // Clean state information of all member variables. + void CleanUp(); + + // After launching, unlock the output resources of op. + void Launch(Node* stmt_node); + + // If all inputs of an op are accessible, the op is considered to be in the + // prepared state + bool IsPrepared(Node* stmt_node); + + // Determine if all inputs of op are accessible. + bool CheckAccess(const std::vector& args); + + // The logic of selecting a stream: + // 1. Make the number of ops on each stream as close as possible. + // 2. The selected stream must be one of the streams contained in the input + // arg + int SelectStreamId(const std::vector& lanes); + + // Check if the model's ops are all supported. If you encounter unsupported + // ops, exit + bool CheckOpSupport(); + + private: + std::list wait_que_; + std::list wait_que_cpu_; + std::queue exec_que_; + std::vector exec_ops_; + std::vector> ops_in_streams_; + std::unordered_map resources_; + std::unordered_map map_arg_to_lane_; + int max_stream_; + int io_copy_once_num_; + std::unordered_set op_types_set_; +}; + +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h index 45b15812fadb0789edea3f89fb00b4612bdb010f..ae7b112d9157de3f53c409dfc89bf1273531e05f 100644 --- a/lite/core/mir/node.h +++ b/lite/core/mir/node.h @@ -80,6 +80,12 @@ class Node { // Description. std::string desc; + + // for cuda multi stream + bool need_sync_{false}; + int stream_id_{0}; + // streams which need to be sync. exclude stream_id_ + std::vector sync_streams_{}; }; struct Arg { @@ -93,6 +99,7 @@ class Node { // if the need more than one tool operator(eg. io_copy layout calib), the // argument between them should be persist to make sure it's only run once bool is_persist{false}; + int lane{-1}; }; Arg& AsArg(const std::string& name, int id); diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h index 4e8c8be292bbd5e7f46664378634d4f1aeed2965..64f2db82c0b1b0b863c1aa61b3b2affea5f85d89 100644 --- a/lite/core/mir/pass.h +++ b/lite/core/mir/pass.h @@ -17,9 +17,11 @@ #include #include #include +#include #include "lite/core/mir/node.h" #include "lite/core/mir/ssa_graph.h" +#include "lite/utils/varient.h" namespace paddle { namespace lite { @@ -121,6 +123,27 @@ class Pass { virtual ~Pass() = default; + bool HasAttr(const std::string& attr_name) const { + return pass_attrs_.count(attr_name) > 0; + } + + // Set a pointer to the attribute. Specific pass itself takes ownership of the + // attribute. + template + void SetAttr(const std::string& attr_name, const AttrType* attr) { + VLOG(4) << "Setting the attribute " << attr_name << " for the pass " + << name_; + pass_attrs_[attr_name].set(*attr); + } + + // Get a reference to the attribute previously set. 
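The new Pass attribute interface (HasAttr/SetAttr above, GetAttr just below) gives passes a string-keyed side channel: multi_stream_analysis_pass publishes "nodes_in_order" and generate_program_pass consumes it. A minimal sketch of the same publish/look-up idea using std::any (C++17) rather than Lite's variant-based pass_attrs_ storage; the class and names here are illustrative only:

#include <any>
#include <iostream>
#include <map>
#include <string>
#include <vector>

class AttrHolder {
 public:
  bool HasAttr(const std::string& name) const { return attrs_.count(name) > 0; }

  template <typename T>
  void SetAttr(const std::string& name, const T& value) {
    attrs_[name] = value;  // the real Pass keeps a pointer-backed variant
  }

  template <typename T>
  const T& GetAttr(const std::string& name) const {
    return std::any_cast<const T&>(attrs_.at(name));
  }

 private:
  std::map<std::string, std::any> attrs_;
};

int main() {
  AttrHolder pass;  // stands in for multi_stream_analysis_pass
  pass.SetAttr<std::vector<std::string>>("nodes_in_order", {"feed", "conv2d", "fetch"});
  if (pass.HasAttr("nodes_in_order")) {  // the generate_program_pass side
    for (const auto& op : pass.GetAttr<std::vector<std::string>>("nodes_in_order"))
      std::cout << op << " ";
  }
  std::cout << std::endl;
}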
+ template + const AttrType& GetAttr(const std::string& attr_name) const { + CHECK(pass_attrs_.count(attr_name)) + << attr_name << " attr not register for pass " << name_; + return pass_attrs_.at(attr_name).get(); + } + private: const Kind kind_; std::string name_; @@ -128,6 +151,8 @@ class Pass { std::set bound_targets_; std::set excluded_targets_; std::unordered_map> bound_kernels_; + std::unordered_map>> + pass_attrs_; }; // Different kinds. diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h index 849f80aea2191b72ac423c7125a4e69cb6927be5..170de1cd31ffd31662eb98898ad795993a36289e 100644 --- a/lite/core/mir/pass_registry.h +++ b/lite/core/mir/pass_registry.h @@ -59,6 +59,9 @@ class PassRegistry { } // namespace lite } // namespace paddle +// some platform-independent defintion +#include "lite/utils/macros.h" + #define REGISTER_MIR_PASS(name__, class__) \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ new class__); \ @@ -66,4 +69,4 @@ class PassRegistry { return mir_pass_registry##name__.Touch(); \ } \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ - __attribute__((unused)) = mir_pass_registry##name__ + UNUSED = mir_pass_registry##name__ diff --git a/lite/core/mir/pattern_matcher.cc b/lite/core/mir/pattern_matcher.cc index b625919cbfb6d26ecbbd1bad36772aff86bee087..aaebf852b2ec519515e59655a57600f59ec6a2c3 100644 --- a/lite/core/mir/pattern_matcher.cc +++ b/lite/core/mir/pattern_matcher.cc @@ -322,7 +322,6 @@ void PatternMatcher::RemoveOverlappedMatch(std::vector *subgraphs) { } std::string PMPattern::DotString() const { - using inference::analysis::Dot; Dot dot; int id = 0; // Create Nodes diff --git a/lite/core/mir/pattern_matcher_high_api.h b/lite/core/mir/pattern_matcher_high_api.h index e62a4fc7494d750b2b5331c4b54b787df239ceee..3ac8e331aacb28044fca7f328319de37b27950bf 100644 --- a/lite/core/mir/pattern_matcher_high_api.h +++ b/lite/core/mir/pattern_matcher_high_api.h @@ -64,7 +64,6 @@ class FuseBase { protected: virtual void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) = 0; - private: void PerformPatternMatcher(SSAGraph* graph); // Delete nodes that are marked as Intermediate diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 40cad8f6af75300ab85753b16e391daeeadc6c2f..37fff018caf4a6d90a48ad3f173ec28c09866690 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, paddle::lite::mir::QuantizedOpAttributesInferencePass) - .BindTargets({TARGET(kNPU)}); + .BindTargets({TARGET(kNPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 97c4819eaf6734ba9b374444166d17cb15e8ae65..5b6f968484b7b49838a004c3edfd00ff9b7e5e5e 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -24,11 +24,32 @@ class RuntimeContextAssignPass : public StmtPass { RuntimeContextAssignPass() {} void Apply(const std::unique_ptr& graph) override { +#ifdef LITE_WITH_OPENCL + using OpenCLContext = Context; + std::unique_ptr local_ctx(new KernelContext()); + local_ctx->As().InitOnce(); +#endif for (auto& node : graph->mutable_nodes()) { if (!node.IsStmt()) continue; auto& inst = node.AsStmt(); - inst.picked_kernel().SetContext( - 
ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + +#ifdef LITE_WITH_OPENCL + if (inst.picked_kernel().target() == TARGET(kOpenCL)) { + std::unique_ptr ctx(new KernelContext()); + (*local_ctx) + .As() + .CopySharedTo(&ctx->As()); + inst.picked_kernel().SetContext(std::move(ctx)); + } else { + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target())); + } +#else + int stream_id = inst.stream_id_; + + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), stream_id)); +#endif } } }; diff --git a/lite/core/mir/ssa_graph.cc b/lite/core/mir/ssa_graph.cc index a2fe31cca72b02b1ac97dac37d51cebb5bb89128..54f5f4d46ce465d9db78b43f339296a3135c9507 100644 --- a/lite/core/mir/ssa_graph.cc +++ b/lite/core/mir/ssa_graph.cc @@ -251,9 +251,10 @@ std::vector SSAGraph::outputs() { } mir::Node *SSAGraph::RetrieveArgument(const std::string &arg) { - auto it = arguments_.find(arg); - if (it != arguments_.end()) { - return it->second; + for (auto &node : node_storage_) { + if (node.IsArg() && node.arg()->name == arg) { + return &node; + } } return nullptr; } diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index 9799cc72437c7581bde681ef2e80c0234635c2fe..b61f7f365f51a32e267dd12943be5fcfadb3e08a 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -30,10 +30,8 @@ namespace paddle { namespace lite { namespace mir { -using inference::analysis::Dot; - std::string SubgraphVisualizer::operator()() { - inference::analysis::Dot dot; + Dot dot; const std::vector subgraph_colors{ "red", "green", "cyan", "bisque3", "coral", "darkseagreen1", "goldenrod1", "darkorchid", @@ -314,14 +312,14 @@ void SubgraphDetector::InitNodes(node_map_t *nodes) { std::vector> SubgraphDetector::ExtractSubgraphs( node_map_t *nodes) { - for (auto &n_tpo : graph_->NodeTopologicalOrder()) { + for (auto &ordered_node : graph_->NodeTopologicalOrder()) { // different orders when traversing nodes in graph may lead to // different subgraph division, which may generate different result // with device such as MLU. These different results are all "right" // but a little confusing. Thus the topological order is used instead // of the address of the node in graph. 
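SSAGraph::RetrieveArgument above now walks the graph's own node storage and returns the first argument node whose name matches (or nullptr), rather than consulting the arguments_ map; presumably this also finds argument nodes that were never registered in that side index. A standalone sketch of the lookup with toy Node/Arg types:

#include <iostream>
#include <list>
#include <string>

struct Arg { std::string name; };
struct Node {
  bool is_arg = false;
  Arg arg;
  bool IsArg() const { return is_arg; }
};

// Linear scan over node storage instead of a name-to-node map.
Node* RetrieveArgument(std::list<Node>& node_storage, const std::string& name) {
  for (auto& node : node_storage) {
    if (node.IsArg() && node.arg.name == name) return &node;
  }
  return nullptr;  // not found
}

int main() {
  std::list<Node> storage = {{true, {"conv2d_out"}}, {false, {}}};
  std::cout << (RetrieveArgument(storage, "conv2d_out") != nullptr) << std::endl;  // 1
}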
- CHECK(nodes->find(n_tpo) != nodes->end()); - node_dat_t *node = (*nodes)[n_tpo]; + CHECK(nodes->find(ordered_node) != nodes->end()); + node_dat_t *node = (*nodes)[ordered_node]; if (!node->marked) { continue; } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 974772a9839c1e089359be3ae98e1833645ccd7a..1e54e1497b5d49754a705340aafa30ded1c2a727 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -200,7 +200,7 @@ TEST(Subgraph, detect_custom_model) { #ifdef LITE_WITH_NPU Place{TARGET(kNPU), PRECISION(kFloat)}, #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL Place{TARGET(kXPU), PRECISION(kFloat)}, #endif }); diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index 45cf142f2f831fae11b6258b78dc24818c3a8988..5c5dc3204b8728e8b30661fae21b056db6960179 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -20,6 +20,7 @@ #include #include "lite/core/mir/pass_registry.h" #include "lite/core/mir/subgraph/subgraph_detector.h" +#include "lite/utils/env.h" namespace paddle { namespace lite { @@ -40,6 +41,7 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { } void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { + if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #include "lite/kernels/xpu/bridges/paddle_use_bridges.h" @@ -67,6 +69,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); @@ -91,5 +107,7 @@ REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index f83448df42ffe6d6d8c5b37503b5127290037dce..b89c20f3bd4b7ca8e9650d20925f5b75dc26ec59 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -37,6 +37,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class MLUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index 
7117e1b3399fe823194f7f1a4d4c239099580955..a2369adc5d882310503cbf52fa5394098d824b40 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -180,7 +180,7 @@ TEST(Subgraph, generate_model_and_check_precision) { #ifdef LITE_WITH_NPU valid_places.push_back(lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}); #endif -#ifdef LITE_WITH_XPU +#ifdef LITE_WITH_XTCL valid_places.push_back(lite_api::Place{TARGET(kXPU), PRECISION(kFloat)}); #endif auto tar_predictor = TestModel(FLAGS_model_dir, diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index ecccf89fa76287a3f30756f7138fcce229e8f337..121e64dc188eeb638becec3506b514bc24dad16d 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, auto input_or_output_scales = op_info->GetAttr>(attr_name); auto size = input_or_output_names.size(); CHECK(size == input_or_output_scales.size()); - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { if (input_or_output_names[i] == var_name) { *scale = input_or_output_scales[i]; return true; @@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { nodes.push_back(node); } + // record the copied node. + std::unordered_map cast_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &cast_nodes); } } } -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void PrecisionCastPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes) { // If this input is out of date. if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, in, graph, inst_node, + cast_nodes, graph->valid_places()); } } -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { +void PrecisionCastPass::AddCastInst( + const Type& from, + const Type& to, + Node* in, + SSAGraph* graph, + Node* inst_node, + std::unordered_map* cast_nodes, + const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst @@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, auto cast_op_output_name = in->AsArg().name + "/precision_trans"; // in->AsArg().name + "/precision_trans/" + // paddle::lite::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - cast_op_output_arg->AsArg().type = - LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); - auto* cast_inst = graph->NewInstructNode(); + if (cast_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + // Update the original instruction OpDesc. 
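The cast_nodes map threaded through ComplementInputs/AddCastInst lets several consumers of the same variable share one calib node: the first consumer creates it and records it, later consumers only relink to the cached node instead of inserting a duplicate cast. A toy sketch of that get-or-create-and-reuse step, with string stand-ins for graph nodes:

#include <iostream>
#include <string>
#include <unordered_map>

struct CastCache {
  std::unordered_map<std::string, std::string> cache;  // var -> cast output

  std::string GetOrCreate(const std::string& var) {
    auto it = cache.find(var);
    if (it != cache.end()) {
      std::cout << "reuse cast for " << var << std::endl;
      return it->second;
    }
    std::string out = var + "/precision_trans";
    std::cout << "insert cast " << var << " -> " << out << std::endl;
    cache[var] = out;
    return out;
  }
};

int main() {
  CastCache cache;
  cache.GetOrCreate("conv2d_out");  // first consumer: inserts the calib op
  cache.GetOrCreate("conv2d_out");  // second consumer: relinks to the same one
}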
+ // Update its input to the cast_op_output_name + // Add new link, newarg->inst + DirectedLink(cast_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } else { + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + cast_op_output_arg->AsArg().type = + LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); + auto* cast_inst = graph->NewInstructNode(); - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; + // create Op and kernels. + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string cast_type = in_persist ? "calib_once" : "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; + auto cast_op = LiteOpRegistry::Global().Create(cast_type); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + // Create the new var manually. + inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - float scale; - if (InferScale(in, inst_node, &scale)) { - op_desc.SetAttr("scale", scale); - } + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(cast_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); + } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->precision() == to.precision()) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); + (*cast_nodes)[in->AsArg().name] = cast_op_output_arg; + break; + } } - } - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << 
inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); + // Remove the old link + RemoveDirectedLink(in, inst_node); - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name + // Update the original instruction OpDesc. + // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); - // reset opdesc and update kernel information - UpdateInputs( - inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } // recreate the op auto original_selected_kernel = diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index b5f7c5d902a998e369f0b1775c59f50cbf8dc256..d8d6af5fcd06c187029c7c16a74efade0d4bd5ca 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes); void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* cast_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); diff --git a/lite/core/mir/type_target_cast_pass.cc b/lite/core/mir/type_target_cast_pass.cc index 75d8022d5f5f9d8572a5e020c11ae5d8cf630c10..aca7343c8af39f767c2a336e0b298995731b755f 100644 --- a/lite/core/mir/type_target_cast_pass.cc +++ b/lite/core/mir/type_target_cast_pass.cc @@ -180,7 +180,7 @@ void TypeTargetTransformPass::AddIoCopyInst( VLOG(4) << "picked, opencl found"; is_found = true; } else if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->target() == to.target()) { + TargetCompatibleTo(*out_arg_ty, to)) { VLOG(4) << "picked"; is_found = true; } diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older model, + // save this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the + * list in the quantized ops. 
+*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. */ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/mir/xpu_pattern_matcher.cc b/lite/core/mir/xpu_pattern_matcher.cc new file mode 100644 index 0000000000000000000000000000000000000000..0f268e7af8a55d22163d52c7f8824406f58bd17b --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
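For abs_max weight quantization, the pass above replicates the single recorded scale into a per-output-channel list, taking the channel count from dims[1] for mul and dims[0] for conv2d/depthwise_conv2d. A small standalone sketch of just that expansion:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// abs_max weight quantization stores one scale, but downstream kernels expect
// one per output channel, so the value is replicated.
std::vector<float> ExpandScale(const std::string& op_type,
                               const std::vector<int64_t>& weight_dims,
                               float single_scale) {
  int out_channels = static_cast<int>(
      op_type == "mul" ? weight_dims[1] : weight_dims[0]);
  return std::vector<float>(out_channels, single_scale);
}

int main() {
  auto scales = ExpandScale("mul", {768, 512}, 0.017f);
  std::cout << scales.size() << " scales, each " << scales[0] << std::endl;  // 512
}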
+ +#include +#include +#include +#include + +#include "lite/core/mir/dot.h" +#include "lite/core/mir/xpu_pattern_matcher.h" +#include "lite/core/op_lite.h" +#include "lite/utils/string.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUPatternMatcher::operator()(SSAGraph *graph, + XPUPatternMatcher::handle_t handler) { + if (!MarkPMNodesInGraph(graph)) { + return; + } + + auto subgraphs = DetectPatterns(); + UniquePatterns(&subgraphs); + RemoveOverlappedMatch(&subgraphs); + ValidateByNodeRole(&subgraphs); + + if (subgraphs.empty()) return; + LOG(INFO) << "detected " << subgraphs.size() << " subgraph"; + int id = 0; + for (auto &g : subgraphs) { + VLOG(3) << "optimizing #" << id++ << " subgraph"; + handler(g, graph); + } +} + +bool XPUPatternMatcher::MarkPMNodesInGraph(SSAGraph *graph) { + VLOG(3) << "mark pmnodes in graph"; + if (graph->nodes().empty()) return false; + for (auto &node : graph->mutable_nodes()) { + for (const auto &pmnode : pattern_.nodes()) { + if (pmnode->Tell(&node)) { + pmnodes2nodes_[pmnode.get()].insert(&node); + } + } + } + // Check to early stop if some PMNode can't find matched Node. + for (auto &pmnode : pattern_.nodes()) { + if (!pmnodes2nodes_.count(pmnode.get())) { + VLOG(4) << pmnode->name() << " can't find matched Node, early stop"; + // return false; + } + } + VLOG(3) << pmnodes2nodes_.size() << " nodes marked"; + + return !pmnodes2nodes_.empty(); +} + +// The intermediate Nodes can only link to the nodes inside the pattern, or this +// subgraph will be droped. +void XPUPatternMatcher::ValidateByNodeRole( + std::vector *subgraphs) { + subgraphs->erase( + std::remove_if(subgraphs->begin(), + subgraphs->end(), + [](const XPUPatternMatcher::subgraph_t &subgraph) -> bool { + // Collect the inlinks and outlinks. + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + for (auto &item : subgraph) { + if (item.first->IsIntermediate()) { + for (auto *x : item.second->outlinks) { + if (!ios.count(x)) { + return true; + } + } + } + } + return false; + }), + subgraphs->end()); + + for (auto &subgraph : *subgraphs) { + std::unordered_set ios; + for (auto &item : subgraph) { + ios.insert(item.second); + } + extra_input_vars_.emplace_back(); + for (auto &item : subgraph) { + for (auto *x : item.second->inlinks) { + if (x->IsArg() && ios.count(x) == 0) { + // extra weight var + extra_input_vars_.back().push_back(x); + } + } + } + } +} + +struct HitGroup { + std::unordered_map roles; + + bool Match(Node *node, PMNode *pat) { + if (nodes_.count(node)) { + if (roles.count(pat) && roles[pat] == node) return true; + return false; + } else { + if (roles.count(pat) && roles[pat] != node) return false; + return true; + } + } + + void Register(Node *node, PMNode *pat) { + roles[pat] = node; + nodes_.insert(node); + } + + private: + std::unordered_set nodes_; +}; + +// Tell whether Node a links to b. +bool IsNodesLink(Node *a, Node *b) { + for (auto *node : a->outlinks) { + if (b == node) { + return true; + } + } + return false; +} + +std::vector XPUPatternMatcher::DetectPatterns() { + // Init empty subgraphs. + std::vector result; + std::vector init_groups; + std::array, 2> bi_records; + auto *first_pnode = pattern_.edges().empty() ? 
pattern().nodes().front().get() + : pattern_.edges().front().first; + if (!pmnodes2nodes_.count(first_pnode)) return result; + for (auto *node : pmnodes2nodes_[first_pnode]) { + HitGroup group; + group.roles[first_pnode] = node; + init_groups.emplace_back(group); + } + + int step = 0; + bi_records[0] = std::move(init_groups); + + // Extend a PMNode to subgraphs by deducing the connection relations defined + // in edges of PMNodes. + for (const auto &edge : pattern_.edges()) { + VLOG(4) << "check " << edge.first->name() << " -> " << edge.second->name(); + // TODO(Superjomn) Fix bug here, the groups might be duplicate here. + // Each role has two PMNodes, which indicates two roles. + // Detect two Nodes that can match these two roles and they are connected. + auto &pre_groups = bi_records[step % 2]; + auto &cur_groups = bi_records[1 - (step++ % 2)]; + cur_groups.clear(); + if (pre_groups.empty()) break; + // source -> target + for (Node *source : pmnodes2nodes_[edge.first]) { + for (Node *target : pmnodes2nodes_[edge.second]) { + // TODO(Superjomn) add some prune strategies. + for (const auto &group : pre_groups) { + if (IsNodesLink(source, target)) { + HitGroup new_group = group; + bool flag = new_group.Match(source, edge.first) && + new_group.Match(target, edge.second); + if (flag) { + new_group.Register(source, edge.first); + new_group.Register(target, edge.second); + cur_groups.push_back(new_group); + // TODO(Superjomn) need to unique + } + } + } + } + } + VLOG(3) << "step " << step << " get records: " << cur_groups.size(); + } + + for (auto &group : bi_records[step % 2]) { + XPUPatternMatcher::subgraph_t subgraph; + for (auto &role : group.roles) { + subgraph.emplace(role.first, role.second); + } + result.emplace_back(subgraph); + } + return result; +} + +struct GraphItemLessThan { + bool operator()(const std::pair &a, + const std::pair &b) { + if (a.first != b.first) { + return a.first < b.first; + } else { + return a.second < b.second; + } + } +}; + +// TODO(Superjomn) enhance the function as it marks unique unique as duplicates +// see https://github.com/PaddlePaddle/Paddle/issues/13550 +void XPUPatternMatcher::UniquePatterns( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + std::vector result; + + std::unordered_set set; + std::hash hasher; + for (auto &g : *subgraphs) { + // Sort the items in the sub-graph, and transform to a string key. 
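DetectPatterns above grows candidate matches edge by edge, and HitGroup::Match/Register keep each partial match consistent: a pattern role binds to at most one graph node, and a graph node fills at most one role. A toy version of that consistency check, using ints in place of node and role pointers:

#include <iostream>
#include <map>
#include <set>

struct HitGroup {
  std::map<int, int> roles;   // pattern role -> graph node
  std::set<int> used_nodes;   // graph nodes already bound to some role

  bool Match(int node, int role) const {
    if (used_nodes.count(node)) {
      auto it = roles.find(role);
      return it != roles.end() && it->second == node;
    }
    auto it = roles.find(role);
    return it == roles.end() || it->second == node;
  }

  void Register(int node, int role) {
    roles[role] = node;
    used_nodes.insert(node);
  }
};

int main() {
  HitGroup group;
  group.Register(/*node=*/10, /*role=*/0);
  std::cout << group.Match(10, 0) << " "        // 1: same binding is consistent
            << group.Match(10, 1) << " "        // 0: node 10 already plays role 0
            << group.Match(11, 0) << std::endl;  // 0: role 0 already bound to node 10
}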
+ std::vector> sorted_keys(g.begin(), g.end()); + std::sort(sorted_keys.begin(), sorted_keys.end(), GraphItemLessThan()); + STL::stringstream ss; + for (auto &item : sorted_keys) { + ss << reinterpret_cast(item.first) << ":" + << reinterpret_cast(item.second); + } + auto key = hasher(ss.str()); + if (!set.count(key)) { + result.emplace_back(g); + set.insert(key); + } + } + *subgraphs = result; +} + +void XPUPatternMatcher::RemoveOverlappedMatch( + std::vector *subgraphs) { + std::vector result; + std::unordered_set node_set; + + for (const auto &subgraph : *subgraphs) { + bool valid = true; + for (auto &item : subgraph) { + if (item.first->IsIntermediate() && node_set.count(item.second)) { + valid = false; + break; + } + } + if (valid) { + for (auto &item : subgraph) { + node_set.insert(item.second); + } + result.push_back(subgraph); + } + } + *subgraphs = result; +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher.h b/lite/core/mir/xpu_pattern_matcher.h new file mode 100644 index 0000000000000000000000000000000000000000..4ac03718f32a859ff6888e63e57fd4098e435e06 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "lite/core/mir/pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +/* + * PatternMatcher helps to detect the specific patterns in the graph. + * Input a pattern, output a list of the matched subgraphs/nodes. + * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.). + * + * The algorithm has three phases: + * 1. Mark the nodes that match the defined PMNodes in a PMPattern, + * 2. Extend a PMNode to subgraphs by deducing the connection relation defined + * in PAPattern(the edges), + * 3. Get the filtered subgraphs and treat them with a pre-defined handler. + * + * Usage: + * // Create a matcher + * PatternMatcher matcher; + * // Define the matcher's pattern, by adding PMNode and define the edges. + * auto* node0 = matcher.mutable_pattern().AddNode(...) + * auto* node1 = matcher.mutable_pattern().AddNode(...) + * node0->teller = some lambda. + * node1->teller = some lambda. + * matcher.mutable_pattern().AddEdge(node0, node1); + * // Create an handler, to define the behavior of treating the filtered + * // subgraphs that comply with the patterns. + * PatternMatcher::handle_t handler = some labmda + * // Execute the matcher. + * matcher(&graph, handler); + */ +struct XPUPatternMatcher { + using subgraph_t = std::unordered_map; + + // Operate on the detected pattern. + using handle_t = + std::function; + + void operator()(SSAGraph* graph, handle_t handler); + + const PMPattern& pattern() const { return pattern_; } + PMPattern* mutable_pattern() { return &pattern_; } + + // Mark the nodes that fits the pattern. 
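RemoveOverlappedMatch above keeps the first match that claims any given intermediate node and drops later overlapping ones, since intermediates are deleted after fusion and cannot be shared. Sketched standalone with toy types:

#include <iostream>
#include <set>
#include <vector>

struct Item { bool intermediate; int node; };
using Match = std::vector<Item>;

// Walk matches in order; drop a match if any of its intermediate nodes was
// already claimed by an earlier kept match.
std::vector<Match> RemoveOverlapped(const std::vector<Match>& matches) {
  std::vector<Match> kept;
  std::set<int> claimed;
  for (const auto& m : matches) {
    bool valid = true;
    for (const auto& item : m) {
      if (item.intermediate && claimed.count(item.node)) { valid = false; break; }
    }
    if (!valid) continue;
    for (const auto& item : m) claimed.insert(item.node);
    kept.push_back(m);
  }
  return kept;
}

int main() {
  std::vector<Match> ms = {{{true, 1}, {false, 2}}, {{true, 1}, {false, 3}}};
  std::cout << RemoveOverlapped(ms).size() << std::endl;  // 1
}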
+ bool MarkPMNodesInGraph(SSAGraph* graph); + + // Detect all the pattern and output the hit records. + std::vector DetectPatterns(); + + // Remove duplicate patterns. + void UniquePatterns(std::vector* subgraphs); + + // Remove overlapped match subgraphs, when overlapped, keep the previous one. + // The intermediate PMNodes will be removed, so can't shared by multiple + // patterns. + void RemoveOverlappedMatch(std::vector* subgraphs); + + // Validate whether the intermediate nodes are linked by external nodes. + void ValidateByNodeRole(std::vector* subgraphs); + + using hit_rcd_t = + std::pair; + PMPattern pattern_; + std::unordered_map> pmnodes2nodes_; + std::vector> extra_input_vars_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.cc b/lite/core/mir/xpu_pattern_matcher_high_api.cc new file mode 100644 index 0000000000000000000000000000000000000000..5ffc496d1593d15f02d82e824c06443e7b3e01c9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/core/mir/xpu_pattern_matcher_high_api.h" +#include +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +void XPUFuseBase::PerformPatternMatcher(SSAGraph *graph) { + VLOG(4) << "\n" << matcher_.pattern().DotString(); + // Get subgraphs and record the mir::Node pointers for each PMNode. + auto handler = [&](const PatternMatcher::subgraph_t &subgraph, SSAGraph *g) { + // get all the reigistered nodes. 
+ key2nodes_.emplace_back(); + for (auto &item : nodes_) { + key2nodes_.back()[item.first] = subgraph.at(item.second); + } + }; + + matcher_(graph, handler); +} + +void XPUFuseBase::DeleteInterNodes(SSAGraph *graph) { + std::set keys; + for (auto &node : nodes_) { + if (node.second->IsIntermediate()) { + keys.insert(node.first); + } + } + + VLOG(4) << "keys: " << key2nodes_.size(); + std::unordered_set nodes2rm; + for (auto &matched : key2nodes_) { + for (const auto &key : keys) { + nodes2rm.insert(matched.at(key)); + } + } + + VLOG(3) << "clean nodes " << nodes2rm.size(); + GraphSafeRemoveNodes(graph, nodes2rm); +} + +PMNode *XPUFuseBase::GetOrCreateNode(const std::string &key) { + auto it = nodes_.find(key); + if (it != nodes_.end()) { + return it->second; + } + nodes_.emplace(key, + matcher_.mutable_pattern()->NewNode(patterns::UniqueKey(key))); + it = nodes_.find(key); + return it->second; +} + +PMNode *XPUFuseBase::OpNode(const std::string &key, + const std::string &op_type) { + GetOrCreateNode(key)->set_op_type(op_type); + GetOrCreateNode(key)->AsOp(op_type); + return GetOrCreateNode(key); +} + +PMNode *XPUFuseBase::VarNode(const std::string &key) { + GetOrCreateNode(key)->AsVar(); + return GetOrCreateNode(key); +} + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/mir/xpu_pattern_matcher_high_api.h b/lite/core/mir/xpu_pattern_matcher_high_api.h new file mode 100644 index 0000000000000000000000000000000000000000..3302bcb6137f16afcf82269af91c8a13558da2b9 --- /dev/null +++ b/lite/core/mir/xpu_pattern_matcher_high_api.h @@ -0,0 +1,82 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/core/mir/pattern_matcher_high_api.h" +#include "lite/core/mir/xpu_pattern_matcher.h" + +namespace paddle { +namespace lite { +namespace mir { +namespace xpu { + +class XPUFuseBase { + public: + using key2nodes_t = std::map; + + virtual ~XPUFuseBase() = default; + + void operator()(SSAGraph* graph) { + BuildPattern(); + PerformPatternMatcher(graph); + + for (size_t i = 0; i < key2nodes_.size(); ++i) { + InsertNewNode(graph, key2nodes_[i], matcher_.extra_input_vars_[i]); + } + + DeleteInterNodes(graph); + } + + // Build a PMPattern using PMNode. + virtual void BuildPattern() = 0; + + // Generate an operator desc with a matched subgraph. 
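GetOrCreateNode above is a keyed factory: each string key names exactly one PMNode, created on first use and returned unchanged afterwards, which is what lets OpNode/VarNode be called repeatedly with the same key. A minimal sketch of that pattern with a toy node type:

#include <iostream>
#include <map>
#include <memory>
#include <string>

struct PMNode { std::string name; };

class NodeRegistry {
 public:
  // Return the node registered under `key`, creating it on first use.
  PMNode* GetOrCreate(const std::string& key) {
    auto it = nodes_.find(key);
    if (it != nodes_.end()) return it->second.get();
    auto node = std::make_unique<PMNode>();
    node->name = key;
    PMNode* raw = node.get();
    nodes_.emplace(key, std::move(node));
    return raw;
  }

 private:
  std::map<std::string, std::unique_ptr<PMNode>> nodes_;
};

int main() {
  NodeRegistry registry;
  PMNode* a = registry.GetOrCreate("input");
  PMNode* b = registry.GetOrCreate("input");
  std::cout << std::boolalpha << (a == b) << std::endl;  // true
}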
+ virtual cpp::OpDesc GenOpDesc(const key2nodes_t& matched) { + return cpp::OpDesc(); + } + + PMNode* OpNode(const std::string& key) { + return GetOrCreateNode(key)->assert_is_op(); + } + + PMNode* OpNode(const std::string& key, const std::string& op_type); + + PMNode* VarNode(const std::string& key); + + protected: + virtual void InsertNewNode(SSAGraph* graph, + const key2nodes_t& matched, + const std::vector& extra_input_vars) = 0; + + void PerformPatternMatcher(SSAGraph* graph); + + // Delete nodes that are marked as Intermediate + void DeleteInterNodes(SSAGraph* graph); + + PMNode* GetOrCreateNode(const std::string& key); + + protected: + XPUPatternMatcher matcher_; + std::map nodes_; + std::vector key2nodes_; +}; + +} // namespace xpu +} // namespace mir +} // namespace lite +} // namespace paddle diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index c76e369466a9b998b2ad6fde67b97117649fddc0..f8a706179374a0c86e28cf9a3638f5df2c932540 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -22,6 +22,61 @@ namespace paddle { namespace lite { +bool OpLite::InferShape() { + // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ + // InferShapeByMemoryInternal will be applied. + if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { + return this->InferShapeWithCache(); + } else { + // otherwise, InferShapeImpl is applied directly. + return this->InferShapeImpl(); + } +} +bool OpLite::InferShapeWithCache() { + // 1. Get vector of current input tensors + auto *current_inputs = param_.input_tensor_ptrs(); + // 2. Get hash value of current inputs shape and lod + size_t new_hash = 0; + for (auto iter = current_inputs->begin(); iter != current_inputs->end(); + iter++) { + // combined dims value into new_hash value. + auto &element_dims = (*iter)->dims(); + for (int i = 0; i < element_dims.size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(element_dims[i])); + } + // combine lod value into new_hash valud. + auto &emement_lods = (*iter)->lod(); + for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); + lod_iter++) { + for (int i = 0; i < lod_iter->size(); i++) { + new_hash = + lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); + } + } + } + // 3. infer shapes of output tensors + if (new_hash == io_shape_lod_hash_ && new_hash != 0) { + // if current hash value is consistent with io_shape_lod_hash_, + // previous outputs shape and lod are reused. + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + current_outputs->at(i)->Resize(last_output_shapes[i]); + current_outputs->at(i)->set_lod(last_output_lods[i]); + } + } else { + // otherwise, current hash value is changed, InferShapeImpl will apply. 
+ io_shape_lod_hash_ = new_hash; + this->InferShapeImpl(); + auto *current_outputs = param_.output_tensor_ptrs(); + for (int i = 0; i < current_outputs->size(); i++) { + last_output_shapes[i] = current_outputs->at(i)->dims(); + last_output_lods[i] = current_outputs->at(i)->lod(); + } + } + return true; +} + std::vector> OpLite::CreateKernels( const std::vector &places, const std::string &kernel_type) { std::vector> kernels; @@ -102,5 +157,33 @@ Tensor *OpLite::GetMutableTensor(lite::Scope *scope, return var->GetMutable(); } +void OpLite::AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var) { + bool is_have_input = + op_desc.HasInput(input_name) && op_desc.Input(input_name).size() > 0; + CHECK(is_dispensable || is_have_input); + if (is_have_input) { + std::string input_var_name = op_desc.Input(input_name).front(); + *input_var = scope->FindVar(input_var_name)->GetMutable(); + } +} + +void OpLite::AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var) { + bool is_have_output = + op_desc.HasOutput(output_name) && op_desc.Output(output_name).size() > 0; + CHECK(is_dispensable || is_have_output); + if (is_have_output) { + std::string output_var_name = op_desc.Output(output_name).front(); + *output_var = scope->FindVar(output_var_name)->GetMutable(); + } +} + } // namespace lite } // namespace paddle diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 77d8091b4b16cfbce2efc3d549f916a9136c61ab..428b188c468ded790e74c9cc4f5da5c7efe2fd00 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,7 @@ #include "lite/core/kernel.h" #include "lite/core/scope.h" #include "lite/model_parser/cpp/op_desc.h" +#include "lite/operators/op_params.h" namespace paddle { namespace lite { @@ -64,8 +66,8 @@ class OpLite : public Registry { // Check the shape. virtual bool CheckShape() const { return true; } // Inference the outputs' shape. - virtual bool InferShape() const { return true; } - virtual bool SmartInferShape() { return this->InferShape(); } + virtual bool InferShapeImpl() const { return true; } + virtual bool InferShape(); // Run this operator. 
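// A minimal standalone sketch of the shape/LoD caching idea used by
// OpLite::InferShapeWithCache above: every input dim and LoD offset is folded
// into a single hash, and InferShapeImpl is re-run only when that hash changes.
// HashCombine below is a hypothetical stand-in for lite::hash_combine and the
// containers are plain std::vector, so this is an illustration rather than the
// patched implementation.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static inline void HashCombine(size_t* seed, int64_t value) {
  // Boost-style hash mixing; lite::hash_combine is assumed to be similar.
  *seed ^= std::hash<int64_t>()(value) + 0x9e3779b9 + (*seed << 6) + (*seed >> 2);
}

struct ShapeLodCache {
  size_t cached_hash{0};

  // Returns true when the current inputs hash to the cached value, meaning the
  // previously inferred output dims/LoDs can be reused instead of recomputed.
  bool Hit(const std::vector<std::vector<int64_t>>& input_dims,
           const std::vector<std::vector<uint64_t>>& input_lods) {
    size_t h = 0;
    for (const auto& dims : input_dims)
      for (int64_t d : dims) HashCombine(&h, d);
    for (const auto& lod : input_lods)
      for (uint64_t offset : lod) HashCombine(&h, static_cast<int64_t>(offset));
    const bool hit = (h == cached_hash) && (h != 0);
    cached_hash = h;
    return hit;
  }
};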
virtual bool Run(); // Indicate whether the Op runs only once or not @@ -103,6 +105,20 @@ class OpLite : public Registry { return kernel_.get(); } + // Attach input variable from scope by op_desc and input name + void AttachInput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &input_name, + bool is_dispensable, + lite::Tensor **input_var); + + // Attach output variable from scope by op_desc and output name + void AttachOutput(const cpp::OpDesc &op_desc, + lite::Scope *scope, + const std::string &output_name, + bool is_dispensable, + lite::Tensor **output_var); + virtual ~OpLite() = default; protected: @@ -151,10 +167,16 @@ class OpLite : public Registry { std::vector valid_places_; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; std::unique_ptr op_info_; - std::vector last_input_shapes; - std::vector last_output_shapes; - std::vector>> last_output_lods; - std::vector>> last_input_lods; + + std::vector last_output_shapes{}; + std::vector>> last_output_lods{}; + size_t io_shape_lod_hash_{}; + mutable operators::ParamBase param_; + + private: + // Infer Shape according to memory, if current input shapes are consistent + // with that of previous inputs, output shapes of last time will be reused. + bool InferShapeWithCache(); }; /* @@ -217,6 +239,32 @@ class OpInfo : public cpp::OpDesc { return false; } + // For the input variable name, find the index of the corresponding + // input argname + bool GetInputIndex(const std::string &value_name, int *out) const { + for (auto &item : inputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + + // For the output variable name, find the index of the corresponding + // output argname + bool GetOutputIndex(const std::string &value_name, int *out) const { + for (auto &item : outputs_) { + auto it = std::find(item.second.begin(), item.second.end(), value_name); + if (it != item.second.end()) { + *out = it - item.second.begin(); + return true; + } + } + return false; + } + void UpdateAllInputs(const std::string &from, const std::string &to) { for (auto &item : inputs_) { for (auto &var : item.second) { diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index fe1dff3c99c1d2413888e78c89c999caea0ab030..0c8d42f4e2dc0b0a32d352ed9b460e1a0b7bfb90 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -110,6 +110,9 @@ std::list> KernelRegistry::Create( case TARGET(kMLU): { CREATE_KERNEL(kMLU); } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -151,14 +154,30 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNHWC); INIT_FOR(kMLU, kInt16, kNCHW); - INIT_FOR(kHost, kFloat, kNCHW); INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); INIT_FOR(kHost, kAny, kNHWC); INIT_FOR(kHost, kAny, kAny); + INIT_FOR(kHost, kBool, kNCHW); + INIT_FOR(kHost, kBool, kNHWC); + INIT_FOR(kHost, kBool, kAny); + INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kFloat, kNHWC); + INIT_FOR(kHost, kFloat, kAny); + INIT_FOR(kHost, kFP16, kNCHW); + INIT_FOR(kHost, kFP16, kNHWC); + INIT_FOR(kHost, kFP16, kAny); + INIT_FOR(kHost, kInt8, kNCHW); + INIT_FOR(kHost, kInt8, kNHWC); + INIT_FOR(kHost, kInt8, kAny); + INIT_FOR(kHost, kInt16, kNCHW); + INIT_FOR(kHost, kInt16, kNHWC); + 
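// A small self-contained illustration of the lookup that the new
// OpInfo::GetInputIndex / GetOutputIndex helpers above perform: given a map
// from argument name to the variable names bound to it, return the position of
// a variable inside its argument list. A plain std::map stands in for the
// OpInfo members here; names are illustrative only.
#include <algorithm>
#include <map>
#include <string>
#include <vector>

static bool FindVarIndex(
    const std::map<std::string, std::vector<std::string>>& args,
    const std::string& var_name,
    int* out) {
  for (const auto& item : args) {
    auto it = std::find(item.second.begin(), item.second.end(), var_name);
    if (it != item.second.end()) {
      *out = static_cast<int>(it - item.second.begin());
      return true;
    }
  }
  return false;
}
// Example: with args = {{"X", {"x0", "x1"}}}, FindVarIndex(args, "x1", &i)
// returns true and sets i to 1.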
INIT_FOR(kHost, kInt16, kAny); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt32, kNHWC); + INIT_FOR(kHost, kInt32, kAny); + INIT_FOR(kHost, kInt64, kNCHW); + INIT_FOR(kHost, kInt64, kNHWC); + INIT_FOR(kHost, kInt64, kAny); INIT_FOR(kX86, kFloat, kNCHW); INIT_FOR(kX86, kAny, kNCHW); @@ -216,6 +235,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 3c41c1fd8af240401c3edf0343433f8d8d9c85db..65279b74c5149f1c73cb42d57b5f47f608f38de1 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -135,6 +135,12 @@ class KernelRegistry final { KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -245,6 +251,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -429,32 +445,31 @@ class KernelRegistor : public lite::Registor { #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - OpKernelInfoCollector::Global().AddKernel2path( \ - #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ - __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) +#define REGISTER_LITE_KERNEL( \ + op_type__, target__, precision__, layout__, KernelClass, alias__) \ + static paddle::lite::KernelRegistor \ + LITE_KERNEL_REGISTER_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__)(#op_type__, \ + #alias__); \ + static KernelClass LITE_KERNEL_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__); \ + int touch_##op_type__##target__##precision__##layout__##alias__() { \ + OpKernelInfoCollector::Global().AddKernel2path( \ + #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ + __FILE__); \ + LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ + .Touch(); \ + return 0; \ + } \ + static bool LITE_KERNEL_PARAM_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__) UNUSED = \ + paddle::lite::ParamTypeRegistry::NewInstance( \ + #op_type__ "/" #alias__) #define LITE_KERNEL_INSTANCE( \ op_type__, target__, precision__, layout__, alias__) \ diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 4348f9eeaa592d263698eb164b29db8126a17698..2fb27996823cb7f9fdb842b668ca93da0941cdb1 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -78,6 +78,8 @@ class Optimizer { (defined LITE_WITH_ARM) 
"lite_elementwise_add_activation_fuse_pass", // #endif + "__xpu__resnet_fuse_pass", + "__xpu__multi_encoder_fuse_pass", "quantized_op_attributes_inference_pass", // Only for fully // quantized model, infer // the output scale and @@ -87,6 +89,7 @@ class Optimizer { "npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -128,7 +131,21 @@ class Optimizer { "memory_optimize_pass"}}; if (passes.size() == 1) { - passes_local.push_back(passes[0]); + // multi_stream_analysis_pass must be in the front of + // runtime_context_assign_pass + const std::string msa_pass{"multi_stream_analysis_pass"}; + const std::string depend_pass{"runtime_context_assign_pass"}; + if (passes[0] == msa_pass) { + auto iter = + std::find(passes_local.begin(), passes_local.end(), depend_pass); + if (iter != passes_local.end()) { + passes_local.insert(iter, msa_pass); + } else { + CHECK(false) << "Not find " << depend_pass; + } + } else { + passes_local.push_back(passes[0]); + } } RunPasses(passes_local); } else { diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index 39213a33cebd05d9cfa50d82cdfb09ad3f7ad637..ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -18,6 +18,7 @@ * of each kernel. */ #pragma once +#include #include #include #include "lite/core/program.h" @@ -177,6 +178,13 @@ class PrecisionProfiler { write_result_to_file&& write_tensorfile(in, name); return; } + case PRECISION(kInt64): { + auto ptr = in->data(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + return; + } default: *mean = -333333333333; *std_dev = -33333333333; diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index f4d0e3c0afbe1f9df4e381a502e1800a3d58ba68..3906cf0989a11c079323bdc8f256e6b5a5a33394 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -100,7 +100,8 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << "Avg (ms)" << " " << setw(12) << left << "Min (ms)" << " " << setw(12) << left << "Max (ms)" - << " " << setw(12) << left << "Last (ms)" << std::endl; + << " " << setw(12) << left << "Last (ms)" + << " " << setw(12) << left << "Percent (%)" << std::endl; // Profile information. 
if (concise) { std::map summary(op_comp); @@ -117,7 +118,16 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { summary.insert({unit.Character(), info}); } } + // compute total time + float total = 0.0; for (const auto& item : summary) { + total += item.second.avg; + } + for (const auto& item : summary) { + float percent = 0; + if (total > 0) { + percent = 100 * (item.second.avg / total); + } // clang-format off ss << setw(25) << left << fixed << item.first.op_type \ << " " << setw(40) << left << fixed << item.first.kernel_name \ @@ -125,12 +135,23 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << item.second.avg \ << " " << setw(12) << left << fixed << item.second.min \ << " " << setw(12) << left << fixed << item.second.max \ + << " " << setw(12) << left << fixed << percent << "%" \ << " " << std::endl; // clang-format on } } else { + float total = 0.0; for (auto& unit : units_) { const auto& times = unit.Timer(type)->LapTimes(); + total += times.Avg(w); + } + for (auto& unit : units_) { + const auto& times = unit.Timer(type)->LapTimes(); + float run = times.Avg(w); + float percent = 0; + if (total > 0) { + percent = 100 * (run / total); + } // clang-format off ss << setw(25) << left << fixed << unit.Character().op_type \ << " " << setw(40) << left << fixed << unit.Character().kernel_name \ @@ -139,6 +160,7 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " " << setw(12) << left << fixed << times.Min(w) \ << " " << setw(12) << left << fixed << times.Max(w) \ << " " << setw(12) << left << fixed << times.Last(w) \ + << " " << setw(12) << left << fixed << percent << "%" \ << std::endl; // clang-format on } diff --git a/lite/core/program.cc b/lite/core/program.cc index 580389fbad54c0de8efd65ef78c9b69fd3e72893..1193e3c84f66b9d1dfb39d5dcc74265d212ab7ab 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -20,7 +20,7 @@ #include "lite/operators/conditional_block_op.h" #include "lite/operators/subgraph_op.h" #include "lite/operators/while_op.h" -#ifdef LITE_WITH_PROFILE +#ifdef LITE_WITH_PRECISION_PROFILE #include "lite/core/profile/precision_profiler.h" #endif @@ -136,34 +136,35 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { } void RuntimeProgram::Run() { -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE auto inst_precision_profiler = paddle::lite::profile::PrecisionProfiler(); std::string precision_profiler_summary = inst_precision_profiler.GetSummaryHeader(); -#endif #endif for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; +#endif +#ifdef LITE_WITH_CUDA + if (inst.need_sync()) { + inst.Sync(); + } #endif inst.Run(); -#ifdef LITE_WITH_PROFILE #ifdef LITE_WITH_PRECISION_PROFILE #ifndef LITE_WITH_FPGA precision_profiler_summary += inst_precision_profiler.GetInstPrecision(&inst); #endif #endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE } #ifdef LITE_WITH_PROFILE LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); +#endif #ifdef LITE_WITH_PRECISION_PROFILE LOG(INFO) << "\n" << precision_profiler_summary; -#endif // LITE_WITH_PRECISION_PROFILE -#endif // LITE_WITH_PROFILE +#endif } void Program::Build(const cpp::ProgramDesc& prog) { @@ -286,8 +287,7 @@ void Instruction::Run() { return; } - // op_->InferShape(); - op_->SmartInferShape(); + op_->InferShape(); kernel_->Launch(); has_run_ = true; } diff --git a/lite/core/program.h b/lite/core/program.h index 
c845a17c52c0c565e339a13e093f3e8f59e8d4a7..9d5fef7c0367d0e0fabf6ecff8b22e5e20a7bb57 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -108,6 +108,18 @@ struct Instruction { bool is_feed_fetch_op() const { return is_feed_fetch_op_; } +#ifdef LITE_WITH_CUDA + bool need_sync() const { + if (kernel_->target() == TargetType::kCUDA) { + return kernel_->mutable_context()->As().need_sync(); + } else { + // the io_copy kernel has synced, so cpu kernels don't need sync.. + return false; + } + } + void Sync() const { kernel_->mutable_context()->As().Sync(); } +#endif + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; diff --git a/lite/core/types.cc b/lite/core/types.cc index 4ea383333d519ac2c481dce459ca49124a64df32..a19c5ed0a33986237ce03213875929d34a2fb363 100644 --- a/lite/core/types.cc +++ b/lite/core/types.cc @@ -67,31 +67,31 @@ STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { template <> Type StdTypeToRepr() { - return Type::_int32; + return Type::INT32; } template <> Type StdTypeToRepr() { - return Type::_int64; + return Type::INT64; } template <> Type StdTypeToRepr() { - return Type::_float32; + return Type::FLOAT32; } template <> Type StdTypeToRepr() { - return Type::_float64; + return Type::Float64; } template <> Type StdTypeToRepr>() { - return Type::_char_list; + return Type::CHARLIST; } template <> Type StdTypeToRepr() { - return Type::_string; + return Type::STRING; } template <> Type StdTypeToRepr() { - return Type::_bool; + return Type::BOOL; } } // namespace core diff --git a/lite/core/types.h b/lite/core/types.h index 8f154f9dd509d3627750ecbf301923a2296252d1..66dc44746a7496d9805e8cc2b6bf2df89b33ddbf 100644 --- a/lite/core/types.h +++ b/lite/core/types.h @@ -29,23 +29,23 @@ namespace core { */ // TODO(Superjomn) unify all the type representation across the lite framework. 
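// The enum renaming in types.cc above and types.h below changes only the
// spelling of the enumerators (Type::_int32 becomes Type::INT32, and so on);
// the StdTypeToRepr<T>() mapping behaves as before. A small usage sketch,
// assuming the usual specializations (int32_t -> INT32, std::string -> STRING)
// and the CHECK macro from lite/utils/cp_logging.h:
#include <cstdint>
#include <string>
#include "lite/core/types.h"
#include "lite/utils/cp_logging.h"

void CheckTypeRepr() {
  using paddle::lite::core::StdTypeToRepr;
  using paddle::lite::core::Type;
  CHECK(StdTypeToRepr<int32_t>() == Type::INT32);
  CHECK(StdTypeToRepr<std::string>() == Type::STRING);
  // Types without a specialization fall back to the primary template (UNK).
  CHECK(StdTypeToRepr<void*>() == Type::UNK);
}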
 enum class Type {
-  _unk = -1,
-  // primary types
-  _int32,
-  _int64,
-  _float32,
-  _float64,
-  _bool,
-  _string,
+  UNK = -1,
+  // primary types
+  INT32,
+  INT64,
+  FLOAT32,
+  Float64,
+  BOOL,
+  STRING,
   // primary list type
-  _char_list,
+  CHARLIST,
   // list types
-  _list,
+  LIST,
   // enum type
-  _enum,
-  _float16,
+  ENUM,
+  FLOAT16,
   // number of types
-  __num__,
+  NUM,
 };
 enum class FluidType {
@@ -81,7 +81,7 @@ enum class FluidType {
 template 
 Type StdTypeToRepr() {
-  return Type::_unk;
+  return Type::UNK;
 }
 template <>
 Type StdTypeToRepr();
@@ -92,6 +92,8 @@ Type StdTypeToRepr();
 template <>
 Type StdTypeToRepr();
 template <>
+Type StdTypeToRepr();
+template <>
 Type StdTypeToRepr>();
 template <>
 Type StdTypeToRepr();
diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
index 0c9da1a76422edae45dfeec5d38556a5e2322a85..2a819883fa316bd1898c063912800b57804218db 100644
--- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
+++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc
@@ -18,6 +18,11 @@
 #include "paddle_api.h"          // NOLINT
 #include "paddle_use_passes.h"   // NOLINT
+#if defined(_WIN32)
+#include "paddle_use_kernels.h"  // NOLINT
+#include "paddle_use_ops.h"      // NOLINT
+#endif
+
 using namespace paddle::lite_api;  // NOLINT
 DEFINE_string(model_dir, "", "Model dir path.");
diff --git a/lite/demo/cxx/train_demo/README.md b/lite/demo/cxx/train_demo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..56f4513d45676a1deb51bfb93096db156ddd0449
--- /dev/null
+++ b/lite/demo/cxx/train_demo/README.md
@@ -0,0 +1,191 @@
+
+# Introduction
+PaddleLite is best known for inference on mobile devices, but it also supports training models on mobile. This document walks through a training example: the "Boston housing price prediction" task, also known as "fit-a-line".
+
+You can learn more about how this task is defined and modeled from the
+[documentation](https://paddlepaddle.org.cn/documentation/docs/zh/user_guides/simple_case/fit_a_line/README.cn.html)
+and
+[source code](https://github.com/PaddlePaddle/book/tree/develop/01.fit_a_line)
+in the book repository; the task is modeled with Linear Regression.
+This document focuses on how to port it to Paddle-Lite for training.
+
+Note: this tutorial trains the model through the C++ API; the other APIs do not support training yet.
+
+# Requirements
+
+- An Android phone to run the training program
+- Python with Paddle (version: 1.7.0) installed
+
+# Quick start
+
+## Step1 build paddle-lite
+
+Follow the [official paddle-lite documentation](https://paddle-lite.readthedocs.io/zh/latest/user_guides/source_compile.html#paddlelite) to build the full_publish paddle-lite lib. Taking a Linux build as an example, the commands are:
+
+```shell
+## set up the environment
+wget -c https://mms-res.cdn.bcebos.com/cmake-3.10.3-Linux-x86_64.tar.gz --no-check-certificate
+tar xzf cmake-3.10.3-Linux-x86_64.tar.gz
+export PATH=${PWD}'/cmake-3.10.3-Linux-x86_64/bin':$PATH
+
+wget https://dl.google.com/android/repository/android-ndk-r17c-linux-x86_64.zip
+unzip android-ndk-r17c-linux-x86_64.zip
+export NDK_ROOT=/opt/android-ndk-r17c
+
+## build
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+./lite/tools/build.sh \
+  --arm_os=android \
+  --arm_abi=armv7 \
+  --build_extra=ON \
+  --arm_lang=gcc \
+  --android_stl=c++_static \
+  --build_train=ON full_publish
+```
+
+Output:
+
+```shell
+Paddle-Lite/build.lite.android.armv7.gcc/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so
+```
+
+## Step2 build lr_trainer
+
+```shell
+cd Paddle-Lite/lite/demo/cxx/train_demo/cplus_train/
+sh run_build.sh /path/to/your/Paddle-Lite/build.lite.android.armv7.gcc/ /path/to/your/android-ndk-r17c
+```
+
+Output:
+```shell
+bin/
+`-- demo_trainer
+```
+
+## Step3 download model and run it!
+
+On your laptop, connect to the phone over USB, enable developer mode, and run the following from any directory:
+
+```shell
+local_path=/data/local/tmp/linear_regression
+adb shell "mkdir "${local_path}
+
+# download model and push to mobile
+wget http://paddle-tar.bj.bcebos.com/paddle-lite/lite_lr_model.tar.gz
+tar -zxvf lite_lr_model.tar.gz
+adb push lite_lr_model/housing.data ${local_path}
+adb push lite_lr_model/model_dir ${local_path}
+
+# push lib and executable file to mobile
+adb push libpaddle_full_api_shared.so ${local_path}
+adb push demo_trainer ${local_path}
+adb shell chmod +x ${local_path}/demo_trainer
+
+# run it!
+adb shell "export LD_LIBRARY_PATH="${local_path}" && export LIBRARY_PATH="${local_path}" && cd "${local_path}" && ./demo_trainer true"
+```
+
+Expected output:
+
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+# More details
+The model above is downloaded as-is. If you want to generate it yourself, run:
+
+```shell
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite/lite/demo/cxx/train_demo/
+python train.py --save_model
+```
+
+Output:
+
+```shell
+model_dir/
+|-- fc_0.b_0
+|-- fc_0.w_0
+|-- learning_rate_0
+`-- __model__
+
+md5sum fc_0.w_0: 2c7b3649b2a9cf7bcd19f8b256ce795d
+```
+
+If you want to produce your own model for training, refer to how the model is saved in `train.py`.
+
+# Checking against Paddle training results
+
+## First 10 loss values
+
+To verify that paddle and lite behave consistently, we trained 10 batches with identical model parameters, identical data, and batch size = 1, and recorded the loss values of both.
+
+python + paddle command:
+
+```shell
+python train.py --num_steps=10 --batch_size=1
+```
+
+python + paddle result:
+
+```shell
+Train cost, Step 0, Cost 564.317017
+Train cost, Step 1, Cost 463.900238
+Train cost, Step 2, Cost 1197.537354
+Train cost, Step 3, Cost 1093.833008
+Train cost, Step 4, Cost 1282.760254
+Train cost, Step 5, Cost 792.097351
+Train cost, Step 6, Cost 491.775848
+Train cost, Step 7, Cost 698.496033
+Train cost, Step 8, Cost 248.444885
+Train cost, Step 9, Cost 325.135132
+```
+
+c++ with paddle-lite command:
+```
+./demo_trainer true
+```
+
+c++ with paddle-lite result:
+```
+sample 0: Loss: 564.317
+sample 1: Loss: 463.9
+sample 2: Loss: 1197.54
+sample 3: Loss: 1093.83
+sample 4: Loss: 1282.76
+sample 5: Loss: 792.097
+sample 6: Loss: 491.776
+sample 7: Loss: 698.496
+sample 8: Loss: 248.445
+sample 9: Loss: 325.135
+```
+
+## Loss curve
+
+With the training batch size fixed at 20 and the training data globally shuffled every epoch, the loss curves of paddle and lite after 100 epochs compare as follows.
+
+![lr_loss](image/lr_loss.png)
+
+To reproduce this, the paddle + python commands are:
+
+```
+git clone https://github.com/PaddlePaddle/book.git
+cd book/01.fit_a_line
+python train.py
+```
+
+The lite + c++ command is:
+```
+./demo_trainer false
+```
diff --git a/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b41808352a186e8ed434c0cf9364a9cae7d3928e
--- /dev/null
+++ b/lite/demo/cxx/train_demo/cplus_train/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 2.8)
+set (CMAKE_CXX_STANDARD 11)
+
+# Project's name
+
+if(NOT DEFINED LITE_ROOT)
+    message(FATAL_ERROR "please set LITE_ROOT with
+    -DLITE_ROOT=/path/to/your/build.lite.android.armv7.gcc/")
+endif()
+
+project(demo_trainer)
+# Set the output folder where your program will be created
+set(CMAKE_BINARY_DIR ${CMAKE_SOURCE_DIR}/bin)
+set(EXECUTABLE_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+set(LIBRARY_OUTPUT_PATH ${CMAKE_BINARY_DIR})
+
+# The following folder will be included
+include_directories("include")
+include_directories("${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/include") + +add_executable(demo_trainer ${PROJECT_SOURCE_DIR}/demo_trainer.cc ${PROJECT_SOURCE_DIR}/data_reader.cc) + +TARGET_LINK_LIBRARIES(demo_trainer +"${LITE_ROOT}/inference_lite_lib.android.armv7/cxx/lib/libpaddle_full_api_shared.so") diff --git a/lite/demo/cxx/train_demo/cplus_train/data_reader.cc b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc new file mode 100644 index 0000000000000000000000000000000000000000..4546e2e5fecc17321e8126485022b4ac30876747 --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/data_reader.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include/data_reader.h" +#include + +using std::string; +using std::vector; + +int FEATURE_NUM = 13; +float rate = 0.8; + +int get_samples(string line, vector* feature, float* label) { + std::istringstream reader(line); + std::vector numbers; + do { + // read as many numbers as possible. + for (float number; reader >> number;) { + numbers.push_back(number); + } + // consume and discard token from stream. + if (reader.fail()) { + reader.clear(); + std::string token; + reader >> token; + } + } while (!reader.eof()); + + assert(numbers.size() == FEATURE_NUM + 1); + for (int i = 0; i < FEATURE_NUM; i++) { + feature->push_back(numbers[i]); + } + *label = numbers[FEATURE_NUM]; + return 0; +} + +int normalize(const vector>& origin_features, + vector>* features, + float rate) { + int inf = std::numeric_limits::max(); + vector min_vec(FEATURE_NUM, static_cast(inf)); + vector max_vec(FEATURE_NUM, -(static_cast(inf))); + vector sum_vec(FEATURE_NUM, 0); + vector avg_vec(FEATURE_NUM, 0); + + for (int i = 0; i < origin_features.size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + min_vec[j] = min(min_vec[j], origin_features[i][j]); + max_vec[j] = max(max_vec[j], origin_features[i][j]); + sum_vec[j] += origin_features[i][j]; + } + } + + for (int i = 0; i < FEATURE_NUM; i++) { + avg_vec[i] = sum_vec[i] / origin_features.size(); + } + + for (int i = 0; i < origin_features.size() * rate - 1; i++) { + vector feat; + for (int j = 0; j < FEATURE_NUM; j++) { + feat.push_back((origin_features[i][j] - avg_vec[j]) / + (max_vec[j] - min_vec[j])); + } + features->push_back(feat); + } +} + +int read_samples(const string fname, + vector>* features, + vector* labels) { + fstream fin; + fin.open(fname); + if (!static_cast(fin)) { + return 1; + } + vector> origin_features; + vector lines; + string line; + while (getline(fin, line)) { + lines.push_back(line); + } + fin.close(); + + for (int i = 0; i < lines.size(); i++) { + vector feat; + float lbl = 0; + get_samples(lines[i], &feat, &lbl); + origin_features.push_back(feat); + if (i < lines.size() * rate - 1) { + labels->push_back(lbl); + } + } + + cout << "finish read fata" << endl; + normalize(origin_features, features, rate); + assert(features->size() == labels->size()); + return 0; +} diff --git 
a/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc new file mode 100644 index 0000000000000000000000000000000000000000..f035078fff35c4b2c0b41d0de84d2621c550d14e --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/demo_trainer.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "include/data_reader.h" +#include "paddle_api.h" // NOLINT + +using namespace paddle::lite_api; // NOLINT + +class LRModel { + public: + void InitModel() { + // 1. Set CxxConfig + CxxConfig config; + config.set_model_dir("model_dir"); + std::vector valid_places{Place{TARGET(kARM), PRECISION(kFloat)}}; + config.set_valid_places(valid_places); + predictor_ = CreatePaddlePredictor(config); + } + + float Predict(const vector>& features, + const vector& labels) { + // Create Tensor + assert(features.size() == labels.size()); + int batch_size = features.size(); + std::unique_ptr input_tensor(std::move(predictor_->GetInput(0))); + input_tensor->Resize(shape_t({batch_size, FEATURE_NUM})); + auto* data = input_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + data[FEATURE_NUM * i + j] = features[i][j]; + } + } + std::unique_ptr y_tensor(std::move(predictor_->GetInput(1))); + y_tensor->Resize(shape_t({batch_size, 1})); + auto* y_data = y_tensor->mutable_data(); + for (int i = 0; i < batch_size; i++) { + y_data[i] = labels[i]; + } + predictor_->Run(); + std::unique_ptr output_tensor( + std::move(predictor_->GetOutput(0))); + return output_tensor->data()[0]; + } + + private: + std::shared_ptr predictor_; +}; + +int shuffle(vector>* features, vector* labels) { + assert(features->size() == labels->size()); + vector index; + for (int i = 0; i < features->size(); i++) { + index.push_back(i); + } + random_shuffle(index.begin(), index.end()); + + vector> tmp_features; + vector tmp_labels; + + for (int i = 0; i < features->size(); i++) { + tmp_features.push_back((*features)[index[i]]); + tmp_labels.push_back((*labels)[index[i]]); + } + + for (int i = 0; i < features->size(); i++) { + for (int j = 0; j < FEATURE_NUM; j++) { + (*features)[i][j] = tmp_features[i][j]; + } + (*labels)[i] = tmp_labels[i]; + } + return 0; +} + +int main(int argc, char* argv[]) { + if (argc < 2) { + cerr << "usage: ./demo_trainer is_small" << endl; + cerr << " if is_small is true, the batch size is set to 1, " << endl; + cerr << " and it will only runs for 10 steps." 
<< endl; + return 1; + } + string is_small = argv[1]; + vector> features; + vector labels; + read_samples("housing.data", &features, &labels); + cout << "sample count: " << features.size() << " " << endl; + + std::shared_ptr local_model(new LRModel()); + local_model->InitModel(); + + if (is_small == "true") { + cout << "small mode" << endl; + for (int i; i < 10; i++) { + vector> batch_feature; + vector batch_label; + batch_feature.push_back(features[i]); + batch_label.push_back(labels[i]); + auto loss = local_model->Predict(batch_feature, batch_label); + cout << "sample " << i << ": " << loss << endl; + } + } else if (is_small == "false") { + // shuffle + cout << "full model" << endl; + int epoch = 100; + int batch_size = 20; + int step = 0; + for (int i; i < epoch; i++) { + shuffle(&features, &labels); + for (int j = 0; + j < ceil(static_cast(features.size()) / batch_size); + j++) { + int start_idx = j * batch_size; + int end_idx = + min((j + 1) * batch_size, static_cast(features.size())); + auto batch_feature = vector>(features.begin() + start_idx, + features.begin() + end_idx); + auto batch_label = + vector(labels.begin() + start_idx, labels.begin() + end_idx); + auto loss = local_model->Predict(batch_feature, batch_label); + if (step % 10 == 0) { + std::cout << "batch: " << i << ", step: " << step + << ", Loss: " << loss << endl; + } + step += 1; + } + } + } else { + cerr << "wrong arg for is_small: " << is_small << endl; + } +} diff --git a/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h new file mode 100644 index 0000000000000000000000000000000000000000..050e929c9135ac939dac747e2e4a2490397a4c3d --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/include/data_reader.h @@ -0,0 +1,37 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include + +using std::string; +using std::vector; +using std::cerr; +using std::cout; +using std::endl; +using std::min; +using std::max; +using std::fstream; + +extern int FEATURE_NUM; + +int get_samples(string line, const vector& feature, float* label); +int read_samples(const string fname, + vector>* features, + vector* labels); diff --git a/lite/demo/cxx/train_demo/cplus_train/run_build.sh b/lite/demo/cxx/train_demo/cplus_train/run_build.sh new file mode 100644 index 0000000000000000000000000000000000000000..4fb444ebd1ecda40db2d69c24016cb78bacdc0ad --- /dev/null +++ b/lite/demo/cxx/train_demo/cplus_train/run_build.sh @@ -0,0 +1,21 @@ + +rm -rf build +mkdir build +cd build + +LITE_ROOT=$1 +NDK_ROOT=$2 + + +cmake .. \ + -DLITE_ROOT=${LITE_ROOT} \ + -DNDK_ROOT=${NDK_ROOT} \ + -DCMAKE_TOOLCHAIN_FILE=${NDK_ROOT}/build/cmake/android.toolchain.cmake \ + -DANDROID_TOOLCHAIN=gcc \ + -DANDROID_ABI="armeabi-v7a" \ + -DANDROID_PLATFORM=android-23 \ + -DANDROID=true \ + -DANDROID_STL=c++_static +make +cd .. 
+# ./bin/demo_trainer diff --git a/lite/demo/cxx/train_demo/image/lr_loss.png b/lite/demo/cxx/train_demo/image/lr_loss.png new file mode 100644 index 0000000000000000000000000000000000000000..626cb57ecd5d4cf50fd4d0b8aaadcc29146ca19b Binary files /dev/null and b/lite/demo/cxx/train_demo/image/lr_loss.png differ diff --git a/lite/demo/cxx/train_demo/train.py b/lite/demo/cxx/train_demo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..37825a5cc472990664f68cb38dbf7ee7859286b8 --- /dev/null +++ b/lite/demo/cxx/train_demo/train.py @@ -0,0 +1,135 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import argparse + +import math +import numpy + +import paddle +import paddle.fluid as fluid + + +def parse_args(): + parser = argparse.ArgumentParser("fit_a_line") + parser.add_argument( + '--save_model', + action='store_true', + help="Whether to save main program") + parser.add_argument( + '--num_steps', + type=int, + default=1000000000000, + help="train steps") + parser.add_argument( + '--num_epochs', type=int, default=100, help="number of epochs.") + parser.add_argument( + '--batch_size', type=int, default=20, help="batch size.") + parser.add_argument( + '--shuffle', + action='store_true', + help="Whether to shuffle train data.") + args = parser.parse_args() + return args + +# For training test cost +def train_test(executor, program, reader, feeder, fetch_list): + accumulated = 1 * [0] + count = 0 + for data_test in reader(): + outs = executor.run( + program=program, feed=feeder.feed(data_test), fetch_list=fetch_list) + accumulated = [x_c[0] + x_c[1][0] for x_c in zip(accumulated, outs)] + count += 1 + return [x_d / count for x_d in accumulated] + + +def main(): + if args.shuffle: + print("doing shuffle") + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.uci_housing.train(), buf_size=500), + batch_size=args.batch_size) + else: + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=args.batch_size) + + # feature vector of length 13 + x = fluid.data(name='x', shape=[None, 13], dtype='float32') + y = fluid.data(name='y', shape=[None, 1], dtype='float32') + + main_program = fluid.default_main_program() + startup_program = fluid.default_startup_program() + + main_program.random_seed = 90 + startup_program.random_seed = 90 + + y_predict = fluid.layers.fc(input=x, size=1, act=None) + cost = fluid.layers.square_error_cost(input=y_predict, label=y) + avg_loss = fluid.layers.mean(cost) + + test_program = main_program.clone(for_test=True) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_loss) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + num_epochs = args.num_epochs + + # main train loop. 
+ feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) + exe.run(startup_program) + if args.save_model: + fluid.io.save_persistables(exe, "model_dir") + + # add feed and fetch op + feeded_var_names = ['x', 'y'] + fetch_var_names = ['mean_0.tmp_0'] + fluid.io.prepend_feed_ops(main_program, feeded_var_names) + fluid.io.append_fetch_ops(main_program, fetch_var_names) + with open("model_dir/__model__", "wb") as f: + f.write(main_program.desc.serialize_to_string()) + + with open("debug_main_program", "w") as f: + f.write(str(main_program)) + print("train model saved to model_dir") + return + + train_prompt = "Train cost" + step = 0 + for pass_id in range(num_epochs): + for data_train in train_reader(): + avg_loss_value, = exe.run( + main_program, + feed=feeder.feed(data_train), + fetch_list=[avg_loss]) + print("%s, Step %d, Cost %f" % + (train_prompt, step, avg_loss_value[0])) + if step == args.num_steps - 1: + return + step += 1 + + if math.isnan(float(avg_loss_value[0])): + sys.exit("got NaN loss, training failed.") + + +if __name__ == '__main__': + args = parse_args() + main() diff --git a/lite/demo/python/mobilenetv1_full_api.py b/lite/demo/python/mobilenetv1_full_api.py index a31469e3e8da81f3753dc5d241d4ef39ac03832f..c3a6bd077be5978f1ecaf9b040b119e50117d62b 100644 --- a/lite/demo/python/mobilenetv1_full_api.py +++ b/lite/demo/python/mobilenetv1_full_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/demo/python/mobilenetv1_light_api.py b/lite/demo/python/mobilenetv1_light_api.py index a44427092bae88aa41b3b1d0684cfcf36835b3d2..5847c7819366b654dd9d5b5cbe2108b54da7b04c 100644 --- a/lite/demo/python/mobilenetv1_light_api.py +++ b/lite/demo/python/mobilenetv1_light_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index d33a77c4bfcefbc349d453de05dcbb7c27707a19..9c96459993e55b441ea795c4f2cb58f40846c0d9 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "lite/fluid/data_type.h" #include #include diff --git a/lite/fluid/lod.h b/lite/fluid/lod.h index 36386f7eb967f31ec258681fe17222a928aa7b4b..b1f2f14a0a4534e588d18237826858812740db69 100644 --- a/lite/fluid/lod.h +++ b/lite/fluid/lod.h @@ -19,7 +19,7 @@ namespace paddle { namespace lite { namespace fluid { -using LoD = std::vector>; +using LoD = std::vector>; static LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 78bb8d10b798b73861ddbf25e427289fc2984a55..b00e818c6cd21de717dab7b896a8f757b5b0011a 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -12,3 +12,4 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index 7550d770145d92ebd343f96a82c6f34d72c91ea5..83c85842f90900496e1a0ed4149a47234899d2f9 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -63,7 +63,6 @@ add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -92,7 +91,6 @@ add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_ add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc 
DEPS ${lite_kernel_deps} math_arm) @@ -106,13 +104,12 @@ add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math # 4. training kernels add_kernel(mean_compute_arm ARM extra SRCS mean_compute.cc DEPS ${lite_kernel_deps} math_arm) -if(LITE_WITH_TRAIN) - add_kernel(mean_grad_compute_arm ARM extra SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(activation_grad_compute_arm ARM basic SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(elementwise_grad_compute_arm ARM basic SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(mul_grad_compute_arm ARM extra SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) - add_kernel(sgd_compute_arm ARM extra SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) -endif() + +add_kernel(mean_grad_compute_arm ARM train SRCS mean_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(activation_grad_compute_arm ARM train SRCS activation_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(elementwise_grad_compute_arm ARM train SRCS elementwise_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(mul_grad_compute_arm ARM train SRCS mul_grad_compute.cc DEPS ${lite_kernel_deps} math_arm) +add_kernel(sgd_compute_arm ARM train SRCS sgd_compute.cc DEPS ${lite_kernel_deps} math_arm) lite_cc_test(test_scale_compute_arm SRCS scale_compute_test.cc DEPS scale_compute_arm) lite_cc_test(test_softmax_compute_arm SRCS softmax_compute_test.cc DEPS softmax_compute_arm) diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index d609716ee53ec584b8340e9b72498ed95afd5820..085e914c6e05c26d3031a4cfdac3c39d31f40f6d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -179,6 +179,44 @@ void SquareCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void HardSwishCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + float threshold = param.hard_swish_threshold; + float scale = param.hard_swish_scale; + float offset = param.hard_swish_offset; + lite::arm::math::act_hard_swish(x_data, + output_data, + x_dims.production(), + threshold, + scale, + offset, + ctx.threads()); +} + +void ReciprocalCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_reciprocal( + x_data, output_data, x_dims.production(), ctx.threads()); +} + +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_abs( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -275,3 +313,26 @@ REGISTER_LITE_KERNEL( .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL(hard_swish, + kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::HardSwishCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL(reciprocal, + 
kARM, + kFloat, + kNCHW, + paddle::lite::kernels::arm::ReciprocalCompute, + def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); +REGISTER_LITE_KERNEL( + abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 476d7bb0a32db193d9afb1451507699d0af71736..2e9774637b7a9156197ffeff5f4bca13a20620bb 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -148,6 +148,33 @@ class SquareCompute : public KernelLite { virtual ~SquareCompute() = default; }; +class HardSwishCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~HardSwishCompute() = default; +}; + +class ReciprocalCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~ReciprocalCompute() = default; +}; + +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~AbsCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index 709942a0d9f385e4ba55be32657633c0edc378cf..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "lite/kernels/arm/compare_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define COMPARE_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T &a, const T &b) const { return a op b; } \ - }; - -COMPARE_FUNCTOR(Equal, ==); -COMPARE_FUNCTOR(NotEqual, !=); -COMPARE_FUNCTOR(LessThan, <); -COMPARE_FUNCTOR(LessEqual, <=); -COMPARE_FUNCTOR(GreaterThan, >); -COMPARE_FUNCTOR(GreaterEqual, >=); - -template <> -struct _EqualFunctor { - inline bool operator()(const float &a, const float &b) const { - // It is safe to cast a and b to double. 
- return fabs(static_cast(a - b)) < 1e-8; - } -}; - -template <> -struct _NotEqualFunctor { - inline bool operator()(const float &a, const float &b) const { - return !_EqualFunctor()(a, b); - } -}; - -inline void get_mid_dims(const lite::DDim &x_dims, - const lite::DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post) { - *pre = 1; - *n = 1; - *post = 1; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - -template
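// A worked example of the pre/n/post decomposition that get_mid_dims above
// (from the removed compare_compute.cc) computes for broadcasting: the x
// tensor is viewed as a [pre, n, post] block, where n covers the dims matched
// by y starting at `axis`. The helper below restates that logic with plain
// std::vector dims so the arithmetic is easy to follow; it is an illustration,
// not the deleted implementation.
#include <cstdint>
#include <vector>

static void MidDims(const std::vector<int64_t>& x_dims,
                    const std::vector<int64_t>& y_dims,
                    int axis,
                    int* pre, int* n, int* post) {
  *pre = 1;
  *n = 1;
  *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= static_cast<int>(x_dims[i]);
  for (size_t i = 0; i < y_dims.size(); ++i) *n *= static_cast<int>(y_dims[i]);
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i)
    *post *= static_cast<int>(x_dims[i]);
}
// Example: MidDims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post) gives
// pre = 2, n = 12, post = 5, i.e. y is broadcast over the middle 3x4 block.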