diff --git a/CMakeLists.txt b/CMakeLists.txt index aefe8cc19c586381aea83645e80b1fd700959bbc..065bcbe3490d7d8ba92dbd17d115d7fefe5c1ec6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") + +if(WIN32) + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + + if (MSVC_STATIC_CRT) + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") + endif() + + add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838) + add_compile_options(/MP) + message(STATUS "Using parallel compiling (/MP)") + set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221") + set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") + +endif() + if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) find_package(CUDA QUIET) endif() @@ -59,10 +84,12 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) +lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) +lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) @@ -105,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) + if(WIN32) + set(CMAKE_BUILD_TYPE "Release" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) + else() + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" FORCE) + endif() endif() message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") @@ -129,6 +163,10 @@ if (LITE_WITH_PYTHON) include(external/pybind11) # download, build, install pybind11 endif() +if(LITE_WITH_RKNPU) + include(device/rknpu) +endif() + # for mobile if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) @@ -136,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) include(cross_compiling/postproject) include(device/npu) # check and prepare NPU DDK include(device/xpu) # check and prepare XPU SDK + include(device/apu) # check and prepare APU SDK # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # So the following third party dependencies are not needed. 
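The MSVC_STATIC_CRT branch above switches the build to the static C runtime by appending /MTd and /MT after whatever /M* flag CMake has already placed in the per-configuration variables, relying on MSVC taking the last runtime switch it sees. A minimal sketch of the stricter variant that rewrites an existing /MD(d) in place instead of appending; this is illustrative only and not part of the patch:

    if(MSVC AND MSVC_STATIC_CRT)
      foreach(flag_var
              CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
              CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE)
        # Replace the dynamic CRT (/MD, /MDd) with the static CRT (/MT, /MTd);
        # /MDd becomes /MTd because the trailing debug suffix is preserved.
        string(REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
      endforeach()
    endif()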
@@ -185,6 +224,7 @@ endif() include(external/mklml) # download mklml package include(external/xbyak) # download xbyak package + include(external/libxsmm) # download, build, install libxsmm include(external/gflags) # download, build, install gflags include(external/glog) # download, build, install glog @@ -209,7 +249,9 @@ include(generic) # simplify cmake module include(ccache) # set ccache for compilation include(util) # set unittest and link libs include(version) # set PADDLE_VERSION -include(flags) +if(NOT APPLE) + include(flags) +endif() set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/build.bat b/build.bat new file mode 100644 index 0000000000000000000000000000000000000000..4510ee774ed9a3b9fe5a9d55b405b1dae39c3f45 --- /dev/null +++ b/build.bat @@ -0,0 +1,134 @@ +@echo off +setlocal +setlocal enabledelayedexpansion + +set source_path=%~dp0 +rem global variables +set BUILD_EXTRA=OFF +set BUILD_JAVA=ON +set BUILD_PYTHON=OFF +set BUILD_DIR=%source_path% +set OPTMODEL_DIR="" +set BUILD_TAILOR=OFF +set BUILD_CV=OFF +set SHUTDOWN_LOG=ON + +set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz + +set workspace=%source_path% + +:set_vcvarsall_dir +SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>" +set tmp_var=!vcvarsall_dir! +call:remove_space +set vcvarsall_dir=!tmp_var! +IF NOT EXIST "%vcvarsall_dir%" ( + echo "------------%vcvarsall_dir% not exist------------" + goto set_vcvarsall_dir +) + +call:prepare_thirdparty + +if EXIST "%build_directory%" ( + call:rm_rebuild_dir "%build_directory%" + md "%build_directory%" +) + +set root_dir=%workspace% +set build_directory=%BUILD_DIR%\build.lite.x86 +set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code +set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug + +rem for code gen, a source file is generated after a test, but is dependended by some targets in cmake. +rem here we fake an empty file to make cmake works. +if NOT EXIST "%GEN_CODE_PATH_PREFIX%" ( + md "%GEN_CODE_PATH_PREFIX%" +) + +type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc" + +if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" ( + md "%DEBUG_TOOL_PATH_PREFIX%" +) + +copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\" + +cd "%build_directory%" + + cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^ + -DWITH_MKLDNN=OFF ^ + -DLITE_WITH_X86=ON ^ + -DLITE_WITH_PROFILE=OFF ^ + -DWITH_LITE=ON ^ + -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^ + -DLITE_WITH_ARM=OFF ^ + -DWITH_GPU=OFF ^ + -DLITE_BUILD_EXTRA=ON ^ + -DLITE_WITH_PYTHON=ON ^ + -DPYTHON_EXECUTABLE="%python_path%" + +call "%vcvarsall_dir%" amd64 + +msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1 +goto:eof + +:prepare_thirdparty + SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>" + set tmp_var=!python_path! + call:remove_space + set python_path=!tmp_var! + if "!python_path!"=="" ( + set python_path=python.exe + ) else ( + if NOT exist "!python_path!" ( + echo "------------!python_path! not exist------------" + goto:eof + ) + ) + + if EXIST "%workspace%\third-party" ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists." 
+ ) else ( + echo "The directory of third_party exists, the third-party-05b862.tar.gz exists." + call:rm_rebuild_dir "%workspace%\third-party" + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + ) else ( + if NOT EXIST "%workspace%\third-party-05b862.tar.gz" ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists." + call:download_third_party + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) else ( + echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists." + !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace% + ) + + ) + git submodule update --init --recursive +goto:eof + +:download_third_party +powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^ +'%workspace%third-party-05b862.tar.gz') +goto:eof + +:rm_rebuild_dir + del /f /s /q "%~1\*.*" >nul 2>&1 + rd /s /q "%~1" >nul 2>&1 +goto:eof + + +:remove_space +:remove_left_space +if "%tmp_var:~0,1%"==" " ( + set "tmp_var=%tmp_var:~1%" + goto remove_left_space +) + +:remove_right_space +if "%tmp_var:~-1%"==" " ( + set "tmp_var=%tmp_var:~0,-1%" + goto remove_left_space +) +goto:eof \ No newline at end of file diff --git a/cmake/configure.cmake b/cmake/configure.cmake index caf456367047277344f0353b6c142b039a81b12c..cf99645409436f24533005b9a74f2bdb1c89f662 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -34,6 +34,15 @@ elseif(SSE3_FOUND) set(SIMD_FLAG ${SSE3_FLAG}) endif() +if(WIN32) + # windows header option for all targets. + add_definitions(-D_XKEYCHECK_H) + + if (NOT MSVC) + message(FATAL_ERROR "Windows builds only support MSVC, which is the host compiler that NVIDIA's nvcc binds to.") + endif(NOT MSVC) +endif(WIN32) + if(LITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) @@ -70,7 +79,7 @@ endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") - if(WIN32) + if(WIN32 OR APPLE) # openmp not support well for now on windows set(OPENMP_FLAGS "") else(WIN32) @@ -134,6 +143,14 @@ if (LITE_WITH_NPU) add_definitions("-DLITE_WITH_NPU") endif() +if (LITE_WITH_APU) + add_definitions("-DLITE_WITH_APU") +endif() + +if (LITE_WITH_RKNPU) + add_definitions("-DLITE_WITH_RKNPU") +endif() + if (LITE_WITH_XPU) add_definitions("-DLITE_WITH_XPU") if (LITE_WITH_XTCL) @@ -181,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") endif(LITE_ON_MODEL_OPTIMIZE_TOOL) +if (LITE_WITH_PYTHON) + add_definitions("-DLITE_WITH_PYTHON") +endif(LITE_WITH_PYTHON) diff --git a/cmake/device/apu.cmake b/cmake/device/apu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..d32e77f867ba3a7628475f8ea06816aa14097442 --- /dev/null +++ b/cmake/device/apu.cmake @@ -0,0 +1,65 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +if(NOT LITE_WITH_APU) + return() +endif() + +if(NOT DEFINED APU_DDK_ROOT) + set(APU_DDK_ROOT $ENV{APU_DDK_ROOT}) + if(NOT APU_DDK_ROOT) + message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON") + endif() +endif() + +message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}") +find_path(APU_DDK_INC NAMES NeuronAdapter.h + PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH) +if(NOT APU_DDK_INC) + message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include") +endif() +message(STATUS "APU_DDK_INC: ${APU_DDK_INC}") + +include_directories("${APU_DDK_ROOT}/include") + +set(APU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(APU_SUB_LIB_PATH "lib64") +endif() + +find_library(APU_NEURON_FILE NAMES neuron + PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH}) + +find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter + PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH}) + +if(NOT APU_NEURON_FILE) + message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}") +else() + message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}") + add_library(apu_neuron SHARED IMPORTED GLOBAL) + set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE}) +endif() + +if(NOT APU_NEURON_ADAPTER_FILE) + message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}") +else() + message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}") + add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL) + set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE}) +endif() + +set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs") +message(STATUS "${apu_runtime_libs}") + diff --git a/cmake/device/rknpu.cmake b/cmake/device/rknpu.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7d430888072b0219bba3112534818d2e10a55579 --- /dev/null +++ b/cmake/device/rknpu.cmake @@ -0,0 +1,55 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
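cmake/device/rknpu.cmake below follows the same layout as cmake/device/apu.cmake above: resolve the DDK root from a CMake variable or the environment, stop with FATAL_ERROR if the headers cannot be found, wrap the prebuilt shared library in an IMPORTED target, and export the result through an INTERNAL cache variable (apu_runtime_libs / rknpu_runtime_libs). A minimal sketch of how a consumer target would pick those cached lists up; the target and source names are placeholders, not part of the patch:

    # Hypothetical consumer of the cached runtime-library lists set by the
    # device scripts; a no-op when neither backend is enabled.
    if(LITE_WITH_APU OR LITE_WITH_RKNPU)
      add_library(device_bridge_demo STATIC bridge_demo.cc)
      target_link_libraries(device_bridge_demo
                            ${apu_runtime_libs} ${rknpu_runtime_libs})
    endif()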
+ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +if(NOT DEFINED RKNPU_DDK_ROOT) + set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT}) + if(NOT RKNPU_DDK_ROOT) + message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON") + endif() +endif() + +message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}") +find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h + PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH) +if(NOT RKNPU_DDK_INC) + message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include") +endif() + +include_directories("${RKNPU_DDK_ROOT}/include") + +set(RKNPU_SUB_LIB_PATH "lib64") +if(ARM_TARGET_ARCH_ABI STREQUAL "armv8") + set(RKNPU_SUB_LIB_PATH "lib64") +endif() + +if(ARM_TARGET_ARCH_ABI STREQUAL "armv7") + set(RKNPU_SUB_LIB_PATH "lib") +endif() + +find_library(RKNPU_DDK_FILE NAMES rknpu_ddk + PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}) + +if(NOT RKNPU_DDK_FILE) + message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}") +else() + message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}") + add_library(rknpu_ddk SHARED IMPORTED GLOBAL) + set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE}) +endif() + +set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 599e7bba7eaf12da7506ce44e706bd9f50ec6998..5a757659bb036ca99326bc40cc075f761ba6e641 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -36,7 +36,16 @@ else() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen GIT_TAG - URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen before v2.3.0 + # URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip + ###################################################################################################### + # url address of eigen since v2.6.0 + # github address: https://github.com/eigenteam/eigen-git-mirror + # we changed the source code to adapt for windows compiling + # git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h + ###################################################################################################### + URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_NO_PROGRESS 1 PREFIX ${EIGEN_SOURCE_DIR} diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 142fce816de4f06aa0a36b91e3e4ecb962a8dc2a..8d094d6e064fe57b170d1a50a5457c104d3c3ac2 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) return() ENDIF(NOT ${WITH_MKLML}) -IF(APPLE) - MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. 
Force WITH_MKLML=OFF.") - SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE) - return() -ENDIF() - INCLUDE(ExternalProject) SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") @@ -38,7 +32,17 @@ IF(WIN32) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) + SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) +ELSEIF(APPLE) + #TODO(intel-huying): + # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. + SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE) + SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE) + SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) + SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib) + SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib) ELSE() #TODO(intel-huying): # Now enable Erf function in mklml library temporarily, it will be updated as offical version later. diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index ae99f4df9a3676ae8f5b2c4c01305ead9b7a8254..57e332f1c103b28a194670de609ee521aa41cdf3 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) SET(py_env "") IF(PYTHONINTERP_FOUND) find_python_module(pip REQUIRED) - find_python_module(numpy REQUIRED) + #find_python_module(numpy REQUIRED) #find_python_module(wheel REQUIRED) #find_python_module(google.protobuf REQUIRED) - FIND_PACKAGE(NumPy REQUIRED) + #FIND_PACKAGE(NumPy REQUIRED) #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # "please use pip to upgrade protobuf. 
pip install -U protobuf") diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 225a3c19a16435c4df6403ff7d1bdd01e628dd72..d859404d559282970d96a735c400f745481e8efa 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) add_dependencies(${TARGET_NAME} mklml) if(WIN32) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) - else(WIN32) + elseif(NOT APPLE) target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") endif(WIN32) endif() diff --git a/cmake/lite.cmake b/cmake/lite.cmake index a07edaa57533e35943aedc5dbf812598d6215714..8408a79fa4265b08771e435dcc5e82801a9d40f9 100644 --- a/cmake/lite.cmake +++ b/cmake/lite.cmake @@ -22,7 +22,7 @@ endfunction() function (lite_deps TARGET) set(options "") set(oneValueArgs "") - set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) + set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) set(deps ${lite_deps_DEPS}) @@ -88,6 +88,18 @@ function (lite_deps TARGET) endforeach(var) endif() + if (LITE_WITH_APU) + foreach(var ${lite_deps_APU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + + if (LITE_WITH_RKNPU) + foreach(var ${lite_deps_RKNPU_DEPS}) + set(deps ${deps} ${var}) + endforeach(var) + endif() + if (LITE_WITH_XPU) foreach(var ${lite_deps_XPU_DEPS}) set(deps ${deps} ${var}) @@ -131,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean function(lite_cc_library TARGET) set(options SHARED shared STATIC static MODULE module) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -142,10 +154,12 @@ function(lite_cc_library TARGET) CUDA_DEPS ${args_CUDA_DEPS} CL_DEPS ${args_CL_DEPS} BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} ARM_DEPS ${args_ARM_DEPS} CV_DEPS ${args_CV_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -161,8 +175,10 @@ function(lite_cc_library TARGET) else() cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() # collect targets need to compile for lite if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) add_dependencies(lite_compile_deps ${TARGET}) @@ -177,7 +193,7 @@ function(lite_cc_binary TARGET) set(options " -g ") endif() set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) cmake_parse_arguments(args "${options}" 
"${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -190,8 +206,10 @@ function(lite_cc_binary TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -199,7 +217,9 @@ function(lite_cc_binary TARGET) MLU_DEPS ${args_MLU_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() if (NOT APPLE) # strip binary target to reduce size if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") @@ -226,7 +246,7 @@ function(lite_cc_test TARGET) endif() set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS COMPILE_LEVEL # (basic|extra) @@ -247,8 +267,10 @@ function(lite_cc_test TARGET) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} HVY_DEPS ${args_HVY_DEPS} @@ -263,7 +285,9 @@ function(lite_cc_test TARGET) "${TARGET}" COMMENT "Strip debug symbols done on final executable file.") endif() - target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + if(NOT WIN32) + target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) + endif() file(APPEND ${offline_test_registry_file} "${TARGET}\n") # collect targets need to compile for lite @@ -277,9 +301,11 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels") set(npu_kernels CACHE INTERNAL "npu kernels") +set(apu_kernels CACHE INTERNAL "apu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels") set(bm_kernels CACHE INTERNAL "bm kernels") +set(rknpu_kernels CACHE INTERNAL "rknpu kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels") set(host_kernels CACHE INTERNAL "host kernels") @@ -295,12 +321,12 @@ if(LITE_BUILD_TAILOR) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) endif() # add a kernel for some specific device -# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) +# device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU) # level: one of (basic, extra) function(add_kernel TARGET device level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -323,6 +349,12 @@ function(add_kernel TARGET device level) if ("${device}" STREQUAL "Host") + if (LITE_ON_MODEL_OPTIMIZE_TOOL) + foreach(src ${args_SRCS}) + 
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") endif() if ("${device}" STREQUAL "ARM") @@ -352,6 +384,15 @@ function(add_kernel TARGET device level) endif() set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "APU") + if (NOT LITE_WITH_APU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "XPU") if (NOT LITE_WITH_XPU) foreach(src ${args_SRCS}) @@ -379,8 +420,20 @@ function(add_kernel TARGET device level) endif() set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") endif() + if ("${device}" STREQUAL "RKNPU") + if (NOT LITE_WITH_RKNPU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() + return() + endif() + set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "") + endif() if ("${device}" STREQUAL "MLU") if (NOT LITE_WITH_MLU) + foreach(src ${args_SRCS}) + file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n") + endforeach() return() endif() set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") @@ -423,8 +476,10 @@ function(add_kernel TARGET device level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -444,7 +499,7 @@ endif() function(add_operator TARGET level) set(options "") set(oneValueArgs "") - set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS + set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -477,8 +532,10 @@ function(add_operator TARGET level) ARM_DEPS ${args_ARM_DEPS} FPGA_DEPS ${args_FPGA_DEPS} NPU_DEPS ${args_NPU_DEPS} + APU_DEPS ${args_APU_DEPS} XPU_DEPS ${args_XPU_DEPS} - BM_DEPS ${args_BM_DEPS} + RKNPU_DEPS ${args_RKNPU_DEPS} + BM_DEPS ${args_BM_DEPS} MLU_DEPS ${args_MLU_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS} @@ -486,6 +543,29 @@ function(add_operator TARGET level) ) endfunction() +#only for windows +function(create_static_lib TARGET_NAME) + set(libs ${ARGN}) + list(REMOVE_DUPLICATES libs) + set(dummy_index 1) + set(dummy_offset 1) + # the dummy target would be consisted of limit size libraries + set(dummy_limit 60) + list(LENGTH libs libs_len) + + foreach(lib ${libs}) + list(APPEND dummy_list ${lib}) + list(LENGTH dummy_list listlen) + if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len})) + merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list}) + set(dummy_list) + list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index}) + MATH(EXPR dummy_index "${dummy_index}+1") + endif() + MATH(EXPR dummy_offset "${dummy_offset}+1") + endforeach() + merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list}) +endfunction() # Bundle several static libraries into one. 
function(bundle_static_library tgt_name bundled_tgt_name fake_target) @@ -529,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) set(bundled_tgt_full_name ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) - #message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") + message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}") + + if(WIN32) + set(dummy_tgt_name dummy_${bundled_tgt_name}) + create_static_lib(${bundled_tgt_name} ${static_libs}) + add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name}) + add_dependencies(${fake_target} ${tgt_name}) + + add_library(${dummy_tgt_name} STATIC IMPORTED) + set_target_properties(${dummy_tgt_name} + PROPERTIES + IMPORTED_LOCATION ${bundled_tgt_full_name} + INTERFACE_INCLUDE_DIRECTORIES $) + add_dependencies(${dummy_tgt_name} ${fake_target}) + return() + endif() if(NOT IOS) file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in diff --git a/lite/CMakeLists.txt b/lite/CMakeLists.txt index e7c4e5fcc5c00929058f11160d0f87d13cbe7f4b..e2b15b187bf6dd3b77fe353f23b5d65bf56e44c7 100644 --- a/lite/CMakeLists.txt +++ b/lite/CMakeLists.txt @@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") +message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") +message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") @@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) if (LITE_WITH_XPU) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") endif(LITE_WITH_XPU) + if (LITE_WITH_APU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu") + endif(LITE_WITH_APU) if (LITE_WITH_FPGA) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") endif(LITE_WITH_FPGA) if (LITE_WITH_BM) set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") endif(LITE_WITH_BM) + if (LITE_WITH_RKNPU) + set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu") + endif(LITE_WITH_RKNPU) else() set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") endif() @@ -83,16 +91,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") # add python lib if (LITE_WITH_PYTHON) - add_custom_target(publish_inference_python_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + if(WIN32) + set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd") + set(LITE_CORE_DEPS ${LITE_CORE}) + add_custom_command(OUTPUT ${LITE_CORE} + COMMAND cmake -E 
copy $ ${LITE_CORE} + DEPENDS lite_pybind) + add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS}) + + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd" + DEPENDS copy_lite_pybind + ) + + add_custom_target(publish_inference_python_installer ${TARGET} + COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel + WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ + DEPENDS publish_inference_python_lib) + add_custom_target(publish_inference_python_light_demo ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/" + ) + add_dependencies(publish_inference publish_inference_python_lib) + add_dependencies(publish_inference publish_inference_python_installer) + add_dependencies(publish_inference publish_inference_python_light_demo) + else() + if(APPLE) + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + else() + add_custom_target(publish_inference_python_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so") + endif() add_custom_target(publish_inference_python_installer ${TARGET} - COMMAND python setup.py bdist_wheel + COMMAND 
${PYTHON_EXECUTABLE} setup.py bdist_wheel WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ DEPENDS publish_inference_python_lib) add_custom_target(publish_inference_python_light_demo ${TARGET} @@ -108,30 +159,78 @@ if (LITE_WITH_PYTHON) add_dependencies(publish_inference publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_light_demo) + endif(WIN32) endif() if (LITE_WITH_CUDA OR LITE_WITH_X86) - add_custom_target(publish_inference_cxx_lib ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" - ) - add_custom_target(publish_inference_third_party ${TARGET} - COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" - COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") - add_dependencies(publish_inference_cxx_lib bundle_full_api) - add_dependencies(publish_inference_cxx_lib bundle_light_api) - add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) - add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) - add_dependencies(publish_inference publish_inference_cxx_lib) - add_dependencies(publish_inference publish_inference_third_party) + if(APPLE) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + elseif(NOT WIN32) + add_custom_target(publish_inference_cxx_lib ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + add_custom_target(publish_inference_third_party ${TARGET} + COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") + 
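Throughout the publish targets above, the Windows branches replace the POSIX mkdir -p and cp commands with ${CMAKE_COMMAND} -E subcommands so the same custom targets run without a Unix shell; the add_dependencies calls just below then wire each branch into publish_inference. A minimal sketch of that portable pattern for an extra artifact; the target name and copied file are placeholders, not part of the patch:

    # Hypothetical publish step written only with cmake -E, so it behaves the
    # same under the Visual Studio, Xcode and Makefile generators.
    add_custom_target(publish_inference_example ALL
      COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/docs"
      COMMAND ${CMAKE_COMMAND} -E copy
              "${CMAKE_SOURCE_DIR}/README.md"
              "${INFER_LITE_PUBLISH_ROOT}/docs/README.md")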
add_dependencies(publish_inference_cxx_lib bundle_full_api) + add_dependencies(publish_inference_cxx_lib bundle_light_api) + add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) + add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) + add_dependencies(publish_inference publish_inference_cxx_lib) + add_dependencies(publish_inference publish_inference_third_party) + endif() endif() if (LITE_WITH_X86) + if(WIN32) + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" + ) + + add_dependencies(publish_inference_x86_cxx_lib bundle_full_api) + add_dependencies(publish_inference_x86_cxx_lib bundle_light_api) + add_dependencies(publish_inference publish_inference_x86_cxx_lib) + + add_custom_target(publish_inference_x86_cxx_demos ${TARGET} + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party" + COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx" + ) + add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos) + add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3) + + else() + add_custom_target(publish_inference_x86_cxx_lib ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" @@ -146,6 +245,7 @@ if (LITE_WITH_X86) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_dependencies(publish_inference publish_inference_x86_cxx_demos) + endif() endif() if(LITE_WITH_CUDA) diff --git a/lite/api/CMakeLists.txt b/lite/api/CMakeLists.txt index 503926978937e788c38b8f08d9d3dd71980918af..0f60b13f35d51d3917425df75d3f157f8b5a87c3 100644 --- a/lite/api/CMakeLists.txt +++ b/lite/api/CMakeLists.txt @@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) 
add_dependencies(paddle_full_api_shared dynload_mklml) endif() + if(WIN32) + target_link_libraries(paddle_full_api_shared shlwapi.lib) + endif() endif() if(LITE_WITH_CUDA) target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") @@ -34,15 +37,20 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} ) + add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) - target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) - set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") - set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") - add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) - add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) - set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) - add_dependencies(paddle_full_api_shared custom_linker_map) + target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels}) + if(NOT APPLE AND NOT WIN32) + set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") + set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") + add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) + add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) + set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) + add_dependencies(paddle_full_api_shared custom_linker_map) + endif() else() if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) add_library(paddle_light_api_shared SHARED "") @@ -57,6 +65,11 @@ else() # Need to add HIAI runtime libs (libhiai.so) dependency target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) endif() + if (LITE_WITH_RKNPU) + # Need to add RKNPU runtime libs dependency + target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs}) + endif() + endif() endif() @@ -67,8 +80,11 @@ if (WITH_TESTING) CUDA_DEPS ${cuda_kernels} X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} - MLU_DEPS ${mlu_kernels}) + MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels}) + endif() if(LITE_WITH_FPGA) set(light_api_deps ${light_api_deps} ${fpga_deps}) @@ -80,6 +96,12 @@ if(LITE_WITH_BM) set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) endif() +if(LITE_WITH_RKNPU) + set(light_api_deps ${light_api_deps} ${rknpu_deps}) + set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps}) +endif() + + message(STATUS "get ops ${ops}") message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}") @@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get NPU kernels ${npu_kernels}") +message(STATUS "get APU kernels ${apu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}") +message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}") @@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} CL_DEPS 
${opencl_kernels} FPGA_DEPS ${fpga_kernels}) @@ -125,7 +151,9 @@ lite_cc_library(light_api SRCS light_api.cc ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -144,7 +172,9 @@ if(WITH_TESTING) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels} @@ -200,7 +230,7 @@ if(WITH_TESTING) endif() if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) - set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) + set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels}) lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc DEPS ${lite_model_test_DEPS} @@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) + # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # DEPS ${lite_model_test_DEPS}) @@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} BM_DEPS ${bm_kernels}) @@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc DEPS light_api program mir_passes paddle_api_light CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) @@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc X86_DEPS ${x86_kernels} XPU_DEPS ${xpu_kernels} FPGA_DEPS ${fpga_kernels} + RKNPU_DEPS ${rknpu_kernels} BM_DEPS ${bm_kernels} MLU_DEPS ${mlu_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model @@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} + RKNPU_DEPS ${rknpu_kernels} CL_DEPS ${opencl_kernels} X86_DEPS ${x86_kernels} FPGA_DEPS ${fpga_kernels} @@ -352,8 +388,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -365,8 +403,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -378,8 +418,10 @@ if(NOT IOS) NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} + RKNPU_DEPS ${rknpu_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) @@ -390,7 +432,9 @@ if(NOT IOS) CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + 
RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} @@ -401,19 +445,24 @@ if(NOT IOS) ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + APU_DEPS ${apu_kernels} XPU_DEPS ${xpu_kernels} + RKNPU_DEPS ${rknpu_kernels} MLU_DEPS ${mlu_kernels} CL_DEPS ${opencl_kernels} BM_DEPS ${bm_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} CUDA_DEPS ${cuda_kernels}) + lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils ${ops} ${host_kernels} ARM_DEPS ${arm_kernels} CV_DEPS paddle_cv_arm NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${npu_kernels} XPU_DEPS ${xpu_kernels} + APU_DEPS ${apu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} X86_DEPS ${x86_kernels} diff --git a/lite/api/benchmark.cc b/lite/api/benchmark.cc index 0843faf0d6b060a5b76a850de069b1dbf714da19..0ce7f6f0d5aa5bb5c7bc66dbeddaa618fa6466e6 100644 --- a/lite/api/benchmark.cc +++ b/lite/api/benchmark.cc @@ -13,7 +13,13 @@ // limitations under the License. #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include #include #include @@ -27,6 +33,9 @@ #include "lite/utils/cp_logging.h" #include "lite/utils/string.h" +DEFINE_string(optimized_model_path, + "", + "the path of the model that is optimized by opt."); DEFINE_string(model_dir, "", "the path of the model, the model and param files is under " @@ -61,10 +70,7 @@ DEFINE_int32(threads, 1, "threads num"); DEFINE_string(result_filename, "result.txt", "save the inference time to the file."); -DEFINE_bool(run_model_optimize, - false, - "if set true, apply model_optimize_tool to " - "model and use optimized model to test. 
"); +DEFINE_bool(show_output, false, "Wether to show the output in shell."); namespace paddle { namespace lite_api { @@ -100,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; } +int64_t ShapeProduction(const std::vector& shape) { + int64_t num = 1; + for (auto i : shape) { + num *= i; + } + return num; +} + #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK void Run(const std::vector& input_shape, - const std::string& model_dir, + const std::string& model_path, const std::string model_name) { // set config and create predictor lite_api::MobileConfig config; config.set_threads(FLAGS_threads); config.set_power_mode(static_cast(FLAGS_power_mode)); - config.set_model_from_file(model_dir + ".nb"); + config.set_model_from_file(model_path); auto predictor = lite_api::CreatePaddlePredictor(config); @@ -116,10 +130,7 @@ void Run(const std::vector& input_shape, auto input_tensor = predictor->GetInput(0); input_tensor->Resize(input_shape); auto input_data = input_tensor->mutable_data(); - int input_num = 1; - for (size_t i = 0; i < input_shape.size(); ++i) { - input_num *= input_shape[i]; - } + int64_t input_num = ShapeProduction(input_shape); if (FLAGS_input_img_path.empty()) { for (int i = 0; i < input_num; ++i) { input_data[i] = 1.f; @@ -167,26 +178,78 @@ void Run(const std::vector& input_shape, ofs << "average = " << std::setw(12) << avg_res; ofs << std::endl; ofs.close(); + + if (FLAGS_show_output) { + auto out_tensor = predictor->GetOutput(0); + auto* out_data = out_tensor->data(); + int64_t output_num = ShapeProduction(out_tensor->shape()); + float max_value = out_data[0]; + int max_index = 0; + for (int i = 0; i < output_num; i++) { + if (max_value < out_data[i]) { + max_value = out_data[i]; + max_index = i; + } + } + LOG(INFO) << "max_value:" << max_value; + LOG(INFO) << "max_index:" << max_index; + LOG(INFO) << "output data[0:10]:"; + for (int i = 0; i < 10; i++) { + LOG(INFO) << out_data[i]; + } + } } #endif } // namespace lite_api } // namespace paddle +void print_usage() { + std::string help_info = + "Usage: \n" + "./benchmark_bin \n" + " --optimized_model_path (The path of the model that is optimized\n" + " by opt. If the model is optimized, please set the param.) \n" + " type: string \n" + " --model_dir (The path of the model that is not optimized by opt,\n" + " the model and param files is under model_dir.) type: string \n" + " --model_filename (The filename of model file. When the model is\n " + " combined formate, please set model_file. Otherwise, it is not\n" + " necessary to set it.) type: string \n" + " --param_filename (The filename of param file, set param_file when\n" + " the model is combined formate. Otherwise, it is not necessary\n" + " to set it.) type: string \n" + " --input_shape (Set input shapes according to the model, separated by\n" + " colon and comma, such as 1,3,244,244) type: string\n" + " default: 1,3,224,224 \n" + " --input_img_path (The path of input image, if not set\n" + " input_img_path, the input will be 1.0.) type: string \n " + " --power_mode (Arm power mode: 0 for big cluster, 1 for little\n" + " cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n" + " --repeats (Repeats times) type: int32 default: 1 \n" + " --result_filename (Save the inference time to the file.) 
type: \n" + " string default: result.txt \n" + " --threads (Threads num) type: int32 default: 1 \n" + " --warmup (Warmup times) type: int32 default: 0 \n" + "Note that: \n" + " If load the optimized model, set optimized_model_path. Otherwise, \n" + " set model_dir, model_filename and param_filename according to \n" + " the model. \n"; + LOG(INFO) << help_info; +} + int main(int argc, char** argv) { + // Check inputs gflags::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_model_dir == "") { - LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; + bool is_opt_model = (FLAGS_optimized_model_path != ""); + bool is_origin_model = (FLAGS_model_dir != ""); + if (!is_origin_model && !is_opt_model) { + LOG(INFO) << "Input error, the model path should not be empty.\n"; + print_usage(); exit(0); } - if (FLAGS_model_dir.back() == '/') { - FLAGS_model_dir.pop_back(); - } - std::size_t found = FLAGS_model_dir.find_last_of("/"); - std::string model_name = FLAGS_model_dir.substr(found + 1); - std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2"; - + // Get input shape auto get_shape = [](const std::string& str_shape) -> std::vector { std::vector shape; std::string tmp_str = str_shape; @@ -202,19 +265,31 @@ int main(int argc, char** argv) { } return shape; }; - std::vector input_shape = get_shape(FLAGS_input_shape); - // Output optimized model if needed - if (FLAGS_run_model_optimize) { - paddle::lite_api::OutputOptModel(save_optimized_model_dir); + // Get model_name and run_model_path + std::string model_name; + std::string run_model_path; + if (is_origin_model) { + if (FLAGS_model_dir.back() == '/') { + FLAGS_model_dir.pop_back(); + } + std::size_t found = FLAGS_model_dir.find_last_of("/"); + model_name = FLAGS_model_dir.substr(found + 1); + std::string optimized_model_path = FLAGS_model_dir + "_opt2"; + paddle::lite_api::OutputOptModel(optimized_model_path); + run_model_path = optimized_model_path + ".nb"; + } else { + size_t found1 = FLAGS_optimized_model_path.find_last_of("/"); + size_t found2 = FLAGS_optimized_model_path.find_last_of("."); + size_t len = found2 - found1 - 1; + model_name = FLAGS_optimized_model_path.substr(found1 + 1, len); + run_model_path = FLAGS_optimized_model_path; } #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - // Run inference using optimized model - std::string run_model_dir = - FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir; - paddle::lite_api::Run(input_shape, run_model_dir, model_name); + // Run test + paddle::lite_api::Run(input_shape, run_model_path, model_name); #endif return 0; } diff --git a/lite/api/cxx_api.cc b/lite/api/cxx_api.cc index 3f3428b434e98ffb0ba578ef7f31a4fbcd9ca619..f4dcac519a0699cbcf1bdd3845d8ae90d7a289ed 100644 --- a/lite/api/cxx_api.cc +++ b/lite/api/cxx_api.cc @@ -292,9 +292,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc, program_desc_ = desc; // `inner_places` is used to optimize passes std::vector inner_places = valid_places; - inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); - inner_places.emplace_back( - TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); + for (auto &valid_place : valid_places) { + inner_places.emplace_back( + Place(TARGET(kHost), valid_place.precision, valid_place.layout)); + } // Analysis whether the modle is quantized. 
// For quantized model, add place(arm, int8) to inner_places diff --git a/lite/api/cxx_api_impl.cc b/lite/api/cxx_api_impl.cc index ccd7c981385ff776c47c01fbfdd058001341dff6..28e87dca394ba06844269746c19a892c26e0c653 100644 --- a/lite/api/cxx_api_impl.cc +++ b/lite/api/cxx_api_impl.cc @@ -20,24 +20,32 @@ #include "lite/core/device_info.h" #include "lite/core/version.h" +#ifndef LITE_ON_TINY_PUBLISH +#include "lite/api/paddle_use_passes.h" +#endif + #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) #include #include "lite/backends/x86/mklml.h" #endif - namespace paddle { namespace lite { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config_ = config; auto places = config.valid_places(); + std::vector passes{}; #ifdef LITE_WITH_CUDA // if kCUDA is included in valid places, it should be initialized first, // otherwise skip this step. for (auto &p : places) { if (p.target == TARGET(kCUDA)) { Env::Init(); + if (config_.multi_stream()) { + passes = {"multi_stream_analysis_pass"}; + VLOG(3) << "add pass: " << passes[0]; + } break; } } @@ -51,7 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { config.mlu_first_conv_std(), config.mlu_input_layout()); #endif // LITE_WITH_MLU - std::vector passes{}; auto use_layout_preprocess_pass = config.model_dir().find("OPENCL_PRE_PRECESS"); VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; @@ -63,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { raw_predictor_.Build(config, places, passes); mode_ = config.power_mode(); threads_ = config.threads(); - #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ - !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) + !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) int num_threads = config.x86_math_library_num_threads(); int real_num_threads = num_threads > 1 ? num_threads : 1; paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); diff --git a/lite/api/light_api.cc b/lite/api/light_api.cc index d82869dbef00929b70a87e05b91ef4a82630bbbe..65ce77276afdb4c3b7a7247cdb8ae120497d8145 100644 --- a/lite/api/light_api.cc +++ b/lite/api/light_api.cc @@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); } + // For weight quantization of post training, load the int8/16 weights + // for optimized model, and dequant it to fp32. 
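For clarity, a standalone sketch (plain arrays and made-up values, not lite::Tensor) of the per-channel dequantization that DequantizeWeight() below performs for conv weights, matching the PROCESS_CONV2D_DATA macro later in this file:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // An int8 conv weight of shape [ch, offset] with one scale per output
  // channel is restored to fp32 as fp[i][j] = scale[i] * int[i][j].
  const int64_t ch = 2, offset = 3;
  std::vector<int8_t> int_data = {10, 20, 30, 40, 50, 60};
  std::vector<float> scale_list = {0.05f, 0.1f};
  std::vector<float> fp_data(ch * offset);
  for (int64_t i = 0; i < ch; ++i) {
    for (int64_t j = 0; j < offset; ++j) {
      fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j];
    }
  }
  for (float v : fp_data) std::cout << v << " ";  // prints: 0.5 1 1.5 4 5 6
  std::cout << "\n";
  return 0;
}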
DequantizeWeight(); + BuildRuntimeProgram(cpp_program_desc_); PrepareFeedFetch(); } @@ -79,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { if (element == input_names_.end()) { LOG(ERROR) << "Model do not have input named with: [" << name << "], model's inputs include:"; - for (int i = 0; i < input_names_.size(); i++) { + for (size_t i = 0; i < input_names_.size(); i++) { LOG(ERROR) << "[" << input_names_[i] << "]"; } return nullptr; @@ -111,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { auto current_block = cpp_program_desc_.GetBlock(0); std::vector feeds; std::vector fetchs; - for (int i = 0; i < current_block->OpsSize(); i++) { + for (size_t i = 0; i < current_block->OpsSize(); i++) { auto op = current_block->GetOp(i); if (op->Type() == "feed") { feeds.push_back(op); @@ -121,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { } input_names_.resize(feeds.size()); output_names_.resize(fetchs.size()); - for (int i = 0; i < feeds.size(); i++) { + for (size_t i = 0; i < feeds.size(); i++) { input_names_[feeds[i]->GetAttr("col")] = feeds[i]->Output("Out").front(); } - for (int i = 0; i < fetchs.size(); i++) { + for (size_t i = 0; i < fetchs.size(); i++) { output_names_[fetchs[i]->GetAttr("col")] = fetchs[i]->Input("X").front(); } @@ -138,9 +141,6 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { // 2. Create Instructs #ifdef LITE_WITH_OPENCL - using WaitListType = - std::unordered_map(nullptr)), - std::shared_ptr>; using OpenCLContext = Context; std::unique_ptr local_ctx(new KernelContext()); local_ctx->As().InitOnce(); @@ -182,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { } void LightPredictor::DequantizeWeight() { -#define PROCESS_CONV2D_DATA() \ - for (int64_t i = 0; i < h; ++i) { \ - for (int64_t j = 0; j < w; ++j) { \ - fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ - } \ +#define PROCESS_CONV2D_DATA() \ + for (int64_t i = 0; i < ch; ++i) { \ + for (int64_t j = 0; j < offset; ++j) { \ + fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \ + } \ } -#define PROCESS_FC_DATA() \ - for (int i = 0; i < input_tensor->numel(); i++) { \ - *fp_data = scale_list[0] * (*int_data); \ - ++fp_data; \ - ++int_data; \ +#define PROCESS_FC_DATA() \ + for (int64_t i = 0; i < chin; i++) { \ + for (int64_t j = 0; j < chout; j++) { \ + fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \ + } \ } + auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) { + bool result = false; + if (op_desc->HasAttr("quantization_type")) { + std::string type = op_desc->GetAttr("quantization_type"); + result = (type == "post_weight_abs_max") || + (type == "post_weight_channel_wise_abs_max"); + } else { + result = op_desc->HasAttr("quantize_weight_bits"); + } + return result; + }; + Tensor tmp_tensor; - CHECK(cpp_program_desc_.BlocksSize()); - auto* main_block = cpp_program_desc_.GetBlock(0); - for (size_t k = 0; k < main_block->OpsSize(); ++k) { - auto* op_desc = main_block->GetOp(k); - if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op - auto input_names = op_desc->input_vars(); - for (auto& input_name : input_names) { - std::string input_scale_name = input_name + "_quant_scale"; - if (op_desc->HasAttr(input_scale_name)) { // the input is quantized - auto input_tensor = - scope_->FindVar(input_name)->GetMutable(); - tmp_tensor.CopyDataFrom(*input_tensor); - auto scale_list = - op_desc->GetAttr>(input_scale_name); - int quantize_weight_bits = - 
op_desc->GetAttr("quantize_weight_bits"); - float* fp_data = input_tensor->mutable_data(); - - std::string op_type = op_desc->Type(); - if (op_type == "conv2d" || op_type == "depthwise_conv2d") { - int64_t h = input_tensor->dims()[0]; - int64_t w = input_tensor->numel() / h; - CHECK_EQ(scale_list.size(), h); - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_CONV2D_DATA() - } - } else if (op_type == "fc" || op_type == "mul") { - if (quantize_weight_bits == 8) { - const int8_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() - } else { - const int16_t* int_data = tmp_tensor.data(); - PROCESS_FC_DATA() + for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) { + auto* block = cpp_program_desc_.GetBlock(i); + for (size_t k = 0; k < block->OpsSize(); ++k) { + auto* op_desc = block->GetOp(k); + if (is_weight_quantized_op(op_desc)) { + auto input_names = op_desc->input_vars(); + for (auto& input_name : input_names) { + std::string input_scale_name = input_name + "_quant_scale"; + if (op_desc->HasAttr(input_scale_name)) { // the input is quantized + auto input_tensor = + scope_->FindVar(input_name)->GetMutable(); + tmp_tensor.CopyDataFrom(*input_tensor); + auto scale_list = + op_desc->GetAttr>(input_scale_name); + + int quantize_weight_bits = + op_desc->GetAttr("quantize_weight_bits"); + CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16); + float* fp_data = input_tensor->mutable_data(); + + std::string op_type = op_desc->Type(); + if (op_type == "conv2d" || op_type == "depthwise_conv2d") { + int64_t ch = input_tensor->dims()[0]; + int64_t offset = input_tensor->numel() / ch; + CHECK_EQ(scale_list.size(), ch); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_CONV2D_DATA() + } + } else if (op_type == "fc" || op_type == "mul") { + int64_t chin = input_tensor->dims()[0]; + int64_t chout = input_tensor->dims()[1]; + CHECK_EQ(scale_list.size(), chout); + if (quantize_weight_bits == 8) { + const int8_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } else { + const int16_t* int_data = tmp_tensor.data(); + PROCESS_FC_DATA() + } } } } diff --git a/lite/api/light_api_test.cc b/lite/api/light_api_test.cc index b49ff8b80c936b93acd630c6e0cde03df8b22ee4..08779c0b5c9802ebc5095241b2543d8724981dff 100644 --- a/lite/api/light_api_test.cc +++ b/lite/api/light_api_test.cc @@ -37,11 +37,11 @@ TEST(LightAPI, load) { const std::vector inputs = predictor.GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } const std::vector outputs = predictor.GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } diff --git a/lite/api/lite_multithread_test.cc b/lite/api/lite_multithread_test.cc index 12559d171ff3df808cf252e8e09c652246902abf..33c0a94cf1a254e42c47aa462c5cfe12e386a87e 100644 --- a/lite/api/lite_multithread_test.cc +++ b/lite/api/lite_multithread_test.cc @@ -293,13 +293,13 @@ int main(int argc, char** argv) { std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { 
input_shapes.push_back(get_shape(str_input_shapes[i])); } std::vector str_input_shapes_0 = split_string(FLAGS_input_shape_0); std::vector> input_shapes_0; - for (int i = 0; i < str_input_shapes_0.size(); ++i) { + for (size_t i = 0; i < str_input_shapes_0.size(); ++i) { input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); } diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc index b0f7a0479f0db91b816838f9d0ee1cc31b9b232a..f61ed9b4c38fcc3a6fe33fd26d6d3a80edcb9373 100644 --- a/lite/api/model_test.cc +++ b/lite/api/model_test.cc @@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, const std::vector>& input_shapes) { lite_api::CxxConfig config; config.set_model_dir(load_model_dir); +#ifdef LITE_WITH_X86 + config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kX86), PRECISION(kInt64)}, + Place{TARGET(kHost), PRECISION(kFloat)}}); +#else config.set_valid_places({ Place{TARGET(kARM), PRECISION(kFloat)}, }); +#endif auto predictor = lite_api::CreatePaddlePredictor(config); // delete old optimized model @@ -198,7 +204,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/model_test_classify.cc b/lite/api/model_test_classify.cc index 375d249476bf5323d69ea41c3f11d07e9c8bc711..5d2011e29bfdeb166ae1ad202d96a204893888b0 100644 --- a/lite/api/model_test_classify.cc +++ b/lite/api/model_test_classify.cc @@ -310,7 +310,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/model_test_detection.cc b/lite/api/model_test_detection.cc index f9be12b2c78c623a2b2c9852850576cc11815bd3..f059aca6330613f66fa93267c0c594cfba6d8833 100644 --- a/lite/api/model_test_detection.cc +++ b/lite/api/model_test_detection.cc @@ -114,7 +114,7 @@ void detect_object(const float* dout, } std::string name = FLAGS_out_txt + "_accu.txt"; FILE* fp = fopen(name.c_str(), "w"); - for (int i = 0; i < objects.size(); ++i) { + for (size_t i = 0; i < objects.size(); ++i) { Object object = objects.at(i); if (object.prob > thresh && object.x > 0 && object.y > 0 && object.width > 0 && object.height > 0) { @@ -324,7 +324,7 @@ int main(int argc, char** argv) { LOG(INFO) << "input shapes: " << FLAGS_input_shape; std::vector str_input_shapes = split_string(FLAGS_input_shape); std::vector> input_shapes; - for (int i = 0; i < str_input_shapes.size(); ++i) { + for (size_t i = 0; i < str_input_shapes.size(); ++i) { LOG(INFO) << "input shape: " << str_input_shapes[i]; input_shapes.push_back(get_shape(str_input_shapes[i])); } diff --git a/lite/api/opt.cc b/lite/api/opt.cc index 7a8cd7f1ef1234269c986b781f0546b26df53c4b..a6ad7cff6f234187770eccf1501378c04201b729 100644 --- a/lite/api/opt.cc +++ b/lite/api/opt.cc @@ -104,13 +104,21 @@ std::vector ParserValidPlaces() { valid_places.emplace_back( TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel } else if 
(target_repr == "x86") { - valid_places.emplace_back(TARGET(kX86)); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)}); + valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)}); } else if (target_repr == "npu") { valid_places.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places.emplace_back(TARGET(kXPU)); } else if (target_repr == "mlu") { valid_places.emplace_back(TARGET(kMLU)); + } else if (target_repr == "rknpu") { + valid_places.emplace_back(TARGET(kRKNPU)); + valid_places.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -187,6 +195,8 @@ void PrintOpsInfo(std::set valid_ops = {}) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", + "kAPU", "kAny", "kUnk"}; int maximum_optype_length = 0; @@ -251,16 +261,16 @@ void PrintHelpInfo() { " `--param_file=`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out=`\n" - " `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" + " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `--record_tailoring_info=(true|false)`\n" " Arguments of model checking and ops information:\n" " `--print_all_ops=true` Display all the valid operators of " "Paddle-Lite\n" " `--print_supported_ops=true " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display valid operators of input targets\n" " `--print_model_ops=true --model_dir= " - "--valid_targets=(arm|opencl|x86|npu|xpu)`" + "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`" " Display operators in the input model\n"; std::cout << "opt version:" << opt_version << std::endl << help_info << std::endl; diff --git a/lite/api/opt_base.cc b/lite/api/opt_base.cc index bd86f486248a2daccde13da078ae3860d8e31169..14c1ca4a4e9c19d2d3c27b783267682457eeddb2 100644 --- a/lite/api/opt_base.cc +++ b/lite/api/opt_base.cc @@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { valid_places_.emplace_back(TARGET(kNPU)); } else if (target_repr == "xpu") { valid_places_.emplace_back(TARGET(kXPU)); + } else if (target_repr == "rknpu") { + valid_places_.emplace_back(TARGET(kRKNPU)); + valid_places_.emplace_back( + TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); + } else if (target_repr == "apu") { + valid_places_.emplace_back( + Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)}); } else { LOG(FATAL) << lite::string_format( "Wrong target '%s' found, please check the command flag " @@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { " `set_param_file(param_file_path)`\n" " `set_model_type(protobuf|naive_buffer)`\n" " `set_optimize_out(output_optimize_model_dir)`\n" - " `set_valid_places(arm|opencl|x86|npu|xpu)`\n" + " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n" " `run_optimize(false|true)`\n" " ` ----fasle&true refer to whether to record ops info for " "tailoring lib, false by default`\n" @@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set& valid_ops) { "kFPGA", "kNPU", "kXPU", + "kRKNPU", + "kAPU", "kAny", "kUnk"}; // Get the lengh of the first column: maximum length of the op_type diff --git a/lite/api/paddle_api.h b/lite/api/paddle_api.h index ce0f0e15d84835fab733a5114906e0a0df3a0064..79ab98da799a99540217d55e3d40b46800f17626 100644 --- a/lite/api/paddle_api.h +++ b/lite/api/paddle_api.h @@ -136,6 +136,9 @@ class 
LITE_API CxxConfig : public ConfigBase { #ifdef LITE_WITH_X86 int x86_math_library_math_threads_ = 1; #endif +#ifdef LITE_WITH_CUDA + bool multi_stream_{false}; +#endif #ifdef LITE_WITH_MLU lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; int mlu_core_number_{1}; @@ -171,6 +174,10 @@ class LITE_API CxxConfig : public ConfigBase { return x86_math_library_math_threads_; } #endif +#ifdef LITE_WITH_CUDA + void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } + int multi_stream() const { return multi_stream_; } +#endif #ifdef LITE_WITH_MLU // set MLU core version, which is used when compiling MLU kernels diff --git a/lite/api/paddle_api_test.cc b/lite/api/paddle_api_test.cc index 9b8384f2823ee121aa8bb505dd135735d9f96774..832867df079efa1baebf08da4c0d8e37958460f1 100644 --- a/lite/api/paddle_api_test.cc +++ b/lite/api/paddle_api_test.cc @@ -36,11 +36,11 @@ TEST(CxxApi, run) { auto inputs = predictor->GetInputNames(); LOG(INFO) << "input size: " << inputs.size(); - for (int i = 0; i < inputs.size(); i++) { + for (size_t i = 0; i < inputs.size(); i++) { LOG(INFO) << "inputnames: " << inputs[i]; } auto outputs = predictor->GetOutputNames(); - for (int i = 0; i < outputs.size(); i++) { + for (size_t i = 0; i < outputs.size(); i++) { LOG(INFO) << "outputnames: " << outputs[i]; } auto input_tensor = predictor->GetInputByName(inputs[0]); diff --git a/lite/api/paddle_lite_factory_helper.h b/lite/api/paddle_lite_factory_helper.h index e99127e233bc4adf159a6a567dfb15f6fd784a27..9dc5c9e857243ecb57f785737b00929e36c5d83c 100644 --- a/lite/api/paddle_lite_factory_helper.h +++ b/lite/api/paddle_lite_factory_helper.h @@ -18,20 +18,21 @@ */ #pragma once -#define USE_LITE_OP(op_type__) \ - extern int touch_op_##op_type__(); \ - int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \ - touch_op_##op_type__(); +// some platform-independent defintion +#include "lite/utils/macros.h" + +#define USE_LITE_OP(op_type__) \ + extern int touch_op_##op_type__(); \ + int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ - __attribute__((unused)) = \ - touch_##op_type__##target__##precision__##layout__##alias__(); + UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); -#define USE_MIR_PASS(name__) \ - extern bool mir_pass_registry##name__##_fake(); \ - static bool mir_pass_usage##name__ __attribute__((unused)) = \ +#define USE_MIR_PASS(name__) \ + extern bool mir_pass_registry##name__##_fake(); \ + static bool mir_pass_usage##name__ UNUSED = \ mir_pass_registry##name__##_fake(); #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ diff --git a/lite/api/paddle_place.cc b/lite/api/paddle_place.cc index aceb047b64f54ac18ac492ef495d32c3180ad4b4..3cef9563d89cd5b21dbdcb0c4ccf1504e7d311b3 100644 --- a/lite/api/paddle_place.cc +++ b/lite/api/paddle_place.cc @@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) { "npu", "xpu", "bm", - "mlu"}; + "mlu", + "rknpu", + "apu"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return target2string[x]; @@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) { "kFPGA", "kNPU", "kXPU", + "kBM", "kMLU", - "kBM"}; + "kRKNPU", + "kAPU"}; auto x = static_cast(target); CHECK_LT(x, static_cast(TARGET(NUM))); return 
target2string[x]; @@ -156,6 +160,7 @@ std::set ExpandValidTargets(TargetType target) { TARGET(kXPU), TARGET(kBM), TARGET(kMLU), + TARGET(kAPU), TARGET(kFPGA)}); if (target == TARGET(kAny)) { return valid_set; diff --git a/lite/api/paddle_place.h b/lite/api/paddle_place.h index f57b9832f2b35fc3db74232192bd397ec8b4930c..7066656f18ec0693048223f5f1201e77a1b0a37d 100644 --- a/lite/api/paddle_place.h +++ b/lite/api/paddle_place.h @@ -49,13 +49,15 @@ enum class TargetType : int { kCUDA = 3, kARM = 4, kOpenCL = 5, + kAny = 6, // any target kFPGA = 7, kNPU = 8, kXPU = 9, kBM = 10, kMLU = 11, - kAny = 6, // any target - NUM = 12, // number of fields. + kRKNPU = 12, + kAPU = 13, + NUM = 14, // number of fields. }; enum class PrecisionType : int { kUnk = 0, diff --git a/lite/api/paddle_use_passes.h b/lite/api/paddle_use_passes.h index 219952bd2aa440c81b116d9ae8aaba0920268eb5..82cd7f3d8da5eb4f00c9069731960a81ef9fe87d 100644 --- a/lite/api/paddle_use_passes.h +++ b/lite/api/paddle_use_passes.h @@ -42,12 +42,14 @@ USE_MIR_PASS(type_precision_cast_pass); USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(memory_optimize_pass); +USE_MIR_PASS(multi_stream_analysis_pass); USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass); +USE_MIR_PASS(apu_subgraph_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); diff --git a/lite/api/python/CMakeLists.txt b/lite/api/python/CMakeLists.txt index ba0c6eb2404ce1ffc2ad5950ee5a3476d42f01b8..5dfecf8c619d8cf9be7a03fa46b4e86a6e641a29 100644 --- a/lite/api/python/CMakeLists.txt +++ b/lite/api/python/CMakeLists.txt @@ -17,8 +17,12 @@ execute_process( OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_STRIP_TRAILING_WHITESPACE ) -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in - ${CMAKE_CURRENT_BINARY_DIR}/setup.py) - +if(APPLE) + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +else() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in + ${CMAKE_CURRENT_BINARY_DIR}/setup.py) +endif() add_subdirectory(pybind) #add_subdirectory(interface) diff --git a/lite/api/python/__init__.py b/lite/api/python/__init__.py index abf198b97e6e818e1fbe59006f98492640bcee54..72a75d9caaa79fa96e52e8603ae6886aac341009 100644 --- a/lite/api/python/__init__.py +++ b/lite/api/python/__init__.py @@ -11,3 +11,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +import os +import sys + +if os.name =='nt': + current_path = os.path.abspath(os.path.dirname(__file__)) + third_lib_path = current_path + os.sep + 'libs' + os.environ['path'] = third_lib_path+ ';' + os.environ['path'] + sys.path.insert(0, third_lib_path) diff --git a/lite/api/python/pybind/CMakeLists.txt b/lite/api/python/pybind/CMakeLists.txt index b1de18d50c1582b0f872ad38d24939665ab1d3b0..fe4cdb5a73d62afa98fb8c343e8a6a20388e293b 100644 --- a/lite/api/python/pybind/CMakeLists.txt +++ b/lite/api/python/pybind/CMakeLists.txt @@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) endif() -lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +if(WIN32) + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) + target_link_libraries(lite_pybind ${os_dependency_modules}) +else() + lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) +endif(WIN32) + if (LITE_ON_TINY_PUBLISH) set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") endif() diff --git a/lite/api/python/pybind/pybind.cc b/lite/api/python/pybind/pybind.cc index 5512e7bc438eddd6bcd9c8f792fc8507b03bf800..06d1c607fd761f9f6e58a4c5779e2c3cb9f4e6b3 100644 --- a/lite/api/python/pybind/pybind.cc +++ b/lite/api/python/pybind/pybind.cc @@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { .value("FPGA", TargetType::kFPGA) .value("NPU", TargetType::kNPU) .value("MLU", TargetType::kMLU) + .value("RKNPU", TargetType::kRKNPU) + .value("APU", TargetType::kAPU) .value("Any", TargetType::kAny); // PrecisionType diff --git a/lite/api/python/setup.py.in b/lite/api/python/setup.py.in index 79028fb7493bf55eab74aa76ee51ac79f418ba0a..b04a6077f5aafecf76fed0b0dee5c56919b9302e 100644 --- a/lite/api/python/setup.py.in +++ b/lite/api/python/setup.py.in @@ -34,20 +34,27 @@ else: # core lib of paddlelite is stored as lite.so LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' -PACKAGE_DATA = {'paddlelite': ['lite.so']} +PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']} # put all thirdparty libraries in paddlelite.libs PACKAGE_DATA['paddlelite.libs'] = [] LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' if '${WITH_MKL}' == 'ON': shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) - PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] - + if os.name != 'nt': + PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] + else: + PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll'] + shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll'] # link lite.so to paddlelite.libs -COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ -/inference_lite_lib/python/install/lite/lite.so" -if os.system(COMMAND) != 0: - raise Exception("patch third_party libs failed, command: %s" % COMMAND) +if os.name != 'nt': + COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ + /inference_lite_lib/python/install/lite/lite.so" + if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + + # remove unused paddle/libs/__init__.py if os.path.isfile(LIB_PATH+'/__init__.py'): @@ -61,6 +68,14 @@ PACKAGE_DIR = { 'paddlelite': LITE_PATH } +if os.name == 'nt': + # fix the path separator under windows + 
fix_package_dir = {} + for k, v in PACKAGE_DIR.items(): + fix_package_dir[k] = v.replace('/', '\\') + PACKAGE_DIR = fix_package_dir + + setup( name='paddlelite', version=PADDLELITE_VERSION, diff --git a/lite/api/python/setup_mac.py.in b/lite/api/python/setup_mac.py.in new file mode 100644 index 0000000000000000000000000000000000000000..c8dfe2cc5c13b3105fc1aed404676eefd40877e8 --- /dev/null +++ b/lite/api/python/setup_mac.py.in @@ -0,0 +1,73 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# module of pack whl installer for Paddle-lite + +import shutil +import os +from setuptools import setup, Distribution + + +class BinaryDistribution(Distribution): + 'binary distribution' + def has_ext_modules(foo): + return True + + +# get paddle-lite version, if it's not based on a release tag, we use commit id instead +PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@" +PADDLELITE_TAG = "@PADDLE_LITE_TAG@" +if PADDLELITE_TAG == "": + PADDLELITE_VERSION = PADDLELITE_COMMITE +else: + PADDLELITE_VERSION = PADDLELITE_TAG + +# core lib of paddlelite is stored as lite.so +LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' +PACKAGE_DATA = {'paddlelite': ['lite.so']} +# put all thirdparty libraries in paddlelite.libs +PACKAGE_DATA['paddlelite.libs'] = [] +LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' + +if '${WITH_MKL}' == 'ON': + shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) + shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) + PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib'] + +# link lite.so to paddlelite.libs +COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\ +/inference_lite_lib/python/install/lite/lite.so" +if os.system(COMMAND) != 0: + raise Exception("patch third_party libs failed, command: %s" % COMMAND) + +# remove unused paddle/libs/__init__.py +if os.path.isfile(LIB_PATH+'/__init__.py'): + os.remove(LIB_PATH+'/__init__.py') + +# set dir path of each package +PACKAGE_DIR = { + # The paddle.fluid.proto will be generated while compiling. + # So that package points to other directory. 
+ 'paddlelite.libs': LIB_PATH, + 'paddlelite': LITE_PATH +} + +setup( + name='paddlelite', + version=PADDLELITE_VERSION, + description='Paddle-Lite Library', + packages=['paddlelite', 'paddlelite.libs'], + package_dir=PACKAGE_DIR, + package_data=PACKAGE_DATA, + distclass=BinaryDistribution +) diff --git a/lite/api/test_googlenet_lite.cc b/lite/api/test_googlenet_lite.cc index 8ff7a49af9cbce09d205bb8633a913410beb91c3..f20714f096756da63bdb99c5bcf57b225658b096 100644 --- a/lite/api/test_googlenet_lite.cc +++ b/lite/api/test_googlenet_lite.cc @@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { for (size_t i = 0; i < results.size(); ++i) { EXPECT_NEAR(out->data()[i * 51], results[i], 1e-5); } - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); } diff --git a/lite/api/test_helper.h b/lite/api/test_helper.h index a17fc331310cfe17ec36be504b94ddacc724e90f..fa6e20230d68c73b0720606816a4594077278d56 100644 --- a/lite/api/test_helper.h +++ b/lite/api/test_helper.h @@ -15,7 +15,12 @@ #pragma once #include +#if !defined(_WIN32) #include +#else +#include +#include "lite/backends/x86/port.h" +#endif #include #include diff --git a/lite/api/test_inceptionv4_lite_x86.cc b/lite/api/test_inceptionv4_lite_x86.cc index e986784809951390889e17f766302fc5ea459465..00f775ddb7e7bf2d2f23c34ce19e576a4d2d27ed 100644 --- a/lite/api/test_inceptionv4_lite_x86.cc +++ b/lite/api/test_inceptionv4_lite_x86.cc @@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv1_lite_x86.cc b/lite/api/test_mobilenetv1_lite_x86.cc index 67dc1b2436988c7d0d853c945fecce27ef2d329f..8a7547b9031d0723c528e7dd6e8d7e3fb6201b7d 100644 --- a/lite/api/test_mobilenetv1_lite_x86.cc +++ b/lite/api/test_mobilenetv1_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); auto out = 
predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_mobilenetv2_lite_x86.cc b/lite/api/test_mobilenetv2_lite_x86.cc index 95e88abcc8e59c6808ea2dc44cf7d1bdd53ac9d0..92c8182f7330a76bf55cf34fbb9e4fdba1fa2fc6 100644 --- a/lite/api/test_mobilenetv2_lite_x86.cc +++ b/lite/api/test_mobilenetv2_lite_x86.cc @@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/test_resnet50_lite_x86.cc b/lite/api/test_resnet50_lite_x86.cc index 3f9b59d714de611ef0a84cfc3b283d0dddd5c294..b185159801b6264555367b41f7def1bd0e7a5a3f 100644 --- a/lite/api/test_resnet50_lite_x86.cc +++ b/lite/api/test_resnet50_lite_x86.cc @@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { input_tensor->Resize(input_shape); auto* data = input_tensor->mutable_data(); int input_num = 1; - for (int i = 0; i < input_shape.size(); ++i) { + for (size_t i = 0; i < input_shape.size(); ++i) { input_num *= input_shape[i]; } for (int i = 0; i < input_num; i++) { @@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); auto out = predictor->GetOutput(0); - ASSERT_EQ(out->shape().size(), 2); + ASSERT_EQ(out->shape().size(), 2u); ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[1], 1000); int step = 50; - for (int i = 0; i < results.size(); ++i) { - for (int j = 0; j < results[i].size(); ++j) { + for (size_t i = 0; i < results.size(); ++i) { + for (size_t j = 0; j < results[i].size(); ++j) { EXPECT_NEAR(out->data()[j * step + (out->shape()[1] * i)], results[i][j], 1e-6); diff --git a/lite/api/transform_test.cc b/lite/api/transform_test.cc index e1c315f4a63ffd3ed8f51fa4b73ac88b50835cab..3cd8416d5e2293642abc68e457465c8a836f790b 100644 --- a/lite/api/transform_test.cc +++ b/lite/api/transform_test.cc @@ -232,8 +232,8 @@ void TestModel(const std::vector& valid_places, for (int i = 0; i < outs->numel(); ++i) { LOG(INFO) << o_data[i]; } - for (int i = 0; i < lod.size(); ++i) { - for (int j = 0; j < lod[i].size(); ++j) { + for (size_t i = 0; i < lod.size(); ++i) { + for (size_t j = 0; j < lod[i].size(); ++j) { LOG(INFO) << lod[i][j]; } } diff --git a/lite/backends/CMakeLists.txt 
b/lite/backends/CMakeLists.txt index fb459ae3621d1281f0a2433ca6b237a165d078a1..7f0d53f976ace17ee8d95e62e62d56f5cb974881 100644 --- a/lite/backends/CMakeLists.txt +++ b/lite/backends/CMakeLists.txt @@ -8,3 +8,5 @@ add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) add_subdirectory(bm) +add_subdirectory(apu) +add_subdirectory(rknpu) diff --git a/lite/backends/apu/CMakeLists.txt b/lite/backends/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..68d77a401f541fa56b2b53ea9a99619f1baafb42 --- /dev/null +++ b/lite/backends/apu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_APU) + return() +endif() + +lite_cc_library(device_apu SRCS device.cc) diff --git a/lite/backends/apu/device.cc b/lite/backends/apu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..27cde9f6efd45a20649b8ff3d4f5ff3b2220aa2d --- /dev/null +++ b/lite/backends/apu/device.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/apu/device.h" +#include +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace apu { + +inline void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) { + typedef int (*NeuronCompilation_create)(NeuronModel * model, + NeuronCompilation * *compilation); + typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation); + typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation); + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create) + LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free) + LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish) +#undef LOAD_FUNCTIONS + + int neuron_errCode = 0; + NeuronCompilation* compilation = NULL; + + VLOG(3) << "[APU] Compile model"; + + neuron_errCode = (*neuron_compilation_create)(model, &compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode; + return nullptr; + } + + neuron_errCode = (*neuron_compilation_finish)(compilation); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] compile failed! 
" << neuron_errCode; + return nullptr; + } + + VLOG(3) << "[APU] Build done"; + return compilation; +} + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/apu/device.h b/lite/backends/apu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..f332512bcb2d5ec9558be0be5694a0623560494c --- /dev/null +++ b/lite/backends/apu/device.h @@ -0,0 +1,40 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "NeuronAdapter.h" // NOLINT + +namespace paddle { +namespace lite { +namespace apu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + NeuronCompilation* Build(void* libHandle, NeuronModel* model); +}; + +} // namespace apu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/arm/math/activation.cc b/lite/backends/arm/math/activation.cc index 26e63e23f6acb761b61b397bb881d425e3442468..1d01642100109d14a413ad5e274606c88bf0005a 100644 --- a/lite/backends/arm/math/activation.cc +++ b/lite/backends/arm/math/activation.cc @@ -744,6 +744,15 @@ void act_reciprocal(const float* din, } } +template <> +void act_abs(const float* din, float* dout, int size, int threads) { + for (int i = 0; i < size; ++i) { + dout[0] = (din[0] > 0 ? 
din[0] : -din[0]); + din++; + dout++; + } +} + #ifdef LITE_WITH_TRAIN template <> void act_square_grad(const float* din, diff --git a/lite/backends/arm/math/activation.h b/lite/backends/arm/math/activation.h index ca6b146442a3ec324a9bd244ee4ce6ad0601d4d7..50f60f300bbab9b9f0bcad222f31699b7bfadeab 100644 --- a/lite/backends/arm/math/activation.h +++ b/lite/backends/arm/math/activation.h @@ -83,6 +83,9 @@ void act_hard_swish(const T* din, template void act_reciprocal(const T* din, T* dout, int size, int threads); +template +void act_abs(const T* din, T* dout, int size, int threads); + #ifdef LITE_WITH_TRAIN template void act_square_grad( diff --git a/lite/backends/arm/math/concat.cc b/lite/backends/arm/math/concat.cc index 65f93453388d7f41d73669f583d189bec9035bb5..e54d70ffbb119d0a91b82f67b77c9d778dea17bf 100644 --- a/lite/backends/arm/math/concat.cc +++ b/lite/backends/arm/math/concat.cc @@ -16,46 +16,3 @@ #include #include #include -#include "lite/backends/arm/math/funcs.h" - -namespace paddle { -namespace lite { -namespace arm { -namespace math { - -void concat_func(const std::vector &input, - const int axis, - lite::Tensor *output) { - int64_t concat_input_size = 1; - int64_t num_cancats = 1; - auto dim_0 = input[0]->dims(); - size_t num = input.size(); - for (int i = axis + 1; i < dim_0.size(); i++) { - concat_input_size *= dim_0[i]; - } - for (int i = 0; i < axis; i++) { - num_cancats *= dim_0[i]; - } - float *dst_ptr = output->mutable_data(); - const int out_concat_axis = output->dims()[axis]; - int64_t offset_concat_axis = 0; - int64_t out_sum = out_concat_axis * concat_input_size; - for (int n = 0; n < num; n++) { - auto dims = input[n]->dims(); - const float *src_ptr = input[n]->data(); - int64_t in_concat_axis = dims[axis]; - float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; - int64_t in_sum = in_concat_axis * concat_input_size; - for (int i = 0; i < num_cancats; i++) { - std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum); - dout_ptr += out_sum; - src_ptr += in_sum; - } - offset_concat_axis += in_concat_axis; - } -} - -} // namespace math -} // namespace arm -} // namespace lite -} // namespace paddle diff --git a/lite/backends/arm/math/concat.h b/lite/backends/arm/math/concat.h index 4c6159e9e09b66edde812e5098e1263963f3e4da..44e8bf73e220f94dca4ba6713debfae77029867a 100644 --- a/lite/backends/arm/math/concat.h +++ b/lite/backends/arm/math/concat.h @@ -25,9 +25,39 @@ namespace lite { namespace arm { namespace math { -void concat_func(const std::vector &input, +template +void concat_func(const std::vector& input, const int axis, - lite::Tensor *output); + lite::Tensor* output) { + size_t num = input.size(); + auto dim_0 = input[0]->dims(); + int64_t concat_input_size = 1; + int64_t num_cancats = 1; + for (int i = axis + 1; i < dim_0.size(); i++) { + concat_input_size *= dim_0[i]; + } + for (int i = 0; i < axis; i++) { + num_cancats *= dim_0[i]; + } + + auto* dst_ptr = output->mutable_data(); + const int out_concat_axis = output->dims()[axis]; + int64_t offset_concat_axis = 0; + int64_t out_sum = out_concat_axis * concat_input_size; + for (int n = 0; n < num; n++) { + auto dims = input[n]->dims(); + auto* src_ptr = input[n]->data(); + int64_t in_concat_axis = dims[axis]; + auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size; + int64_t in_sum = in_concat_axis * concat_input_size; + for (int i = 0; i < num_cancats; i++) { + std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum); + dout_ptr += out_sum; + src_ptr += in_sum; + } + offset_concat_axis += 
in_concat_axis; + } +} } // namespace math } // namespace arm diff --git a/lite/backends/arm/math/reduce_mean.cc b/lite/backends/arm/math/reduce_mean.cc index 56104550d8d68e53ad9a2ac3148887d67480d6f6..a84eef2970b2837159609c1ded1ca0d9991ccfc6 100644 --- a/lite/backends/arm/math/reduce_mean.cc +++ b/lite/backends/arm/math/reduce_mean.cc @@ -198,6 +198,23 @@ void reduce_mean_hw(const float* src, reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); } +template <> +void mean_grad(const float* out_grad, float* in_grad, int size) { + float grad = out_grad[0] / size; + float32x4_t grad_v = vdupq_n_f32(grad); + int loop = size >> 2; + int remain = size & 3; + +#pragma omp parallel for + for (int i = 0; i < loop; ++i) { + vst1q_f32(in_grad, grad_v); + in_grad += 4; + } + for (int i = 0; i < remain; ++i) { + in_grad[i] = grad; + } +} + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/arm/math/reduce_mean.h b/lite/backends/arm/math/reduce_mean.h index 277ed209c058b5b4be76ce18a00683610e6afb7a..aaa9ff42c18d0cfa6a7cf11408dfba06a9444adc 100644 --- a/lite/backends/arm/math/reduce_mean.h +++ b/lite/backends/arm/math/reduce_mean.h @@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, int height_in, int width_in); +template +void mean_grad(const T* out_grad, T* in_grad, int size); + } // namespace math } // namespace arm } // namespace lite diff --git a/lite/backends/cuda/CMakeLists.txt b/lite/backends/cuda/CMakeLists.txt index 35f5f0ce2d93db59cbb856d8008e6f3138633e42..0689bb706ab3bac4b8b97059017181ef24dd8ee4 100644 --- a/lite/backends/cuda/CMakeLists.txt +++ b/lite/backends/cuda/CMakeLists.txt @@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) + +lite_cc_library(cuda_context SRCS context.cc DEPS device_info) add_subdirectory(math) diff --git a/lite/backends/cuda/context.cc b/lite/backends/cuda/context.cc new file mode 100644 index 0000000000000000000000000000000000000000..4bac4c442c28848d38bd434d045c7888a1a92ac8 --- /dev/null +++ b/lite/backends/cuda/context.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/backends/cuda/context.h" + +namespace paddle { +namespace lite {} // namespace lite +} // namespace paddle diff --git a/lite/backends/cuda/context.h b/lite/backends/cuda/context.h new file mode 100644 index 0000000000000000000000000000000000000000..5bed30a9603c6f6a48169ae31d66c989bd891836 --- /dev/null +++ b/lite/backends/cuda/context.h @@ -0,0 +1,170 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "lite/backends/cuda/blas.h" +#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/target_wrapper.h" +#include "lite/core/device_info.h" + +namespace paddle { +namespace lite { + +template +class Context; + +using CUDAContext = Context; + +// Only works with CUDA kernels. +template <> +class Context { + public: + typename Env::Devs& devs = + Env::Global(); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() { + if (devs.size() > 0) { + cublas_fp32_ = std::make_shared>(); + } else { + LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; + } + } + void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { + CHECK_GT(devs.size(), 0UL) + << "Env is not initialized or current target is not exit!"; + if (dev_id >= static_cast(devs.size())) { + LOG(WARNING) << "device index exceeds the number of devices, set to " + "default device(0)!"; + device_id_ = 0; + } else { + device_id_ = dev_id; + } + if (io_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "data stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + io_stream_id = 0; + } + if (exec_stream_id >= devs[dev_id].max_stream()) { + LOG(WARNING) << "exec stream index exceeds the maximum stream number, " + "set to default stream(0)!"; + exec_stream_id = 0; + } + + exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; + io_stream_ = devs[dev_id].io_streams()[io_stream_id]; + + exec_stream_id_ = exec_stream_id; + io_stream_id_ = io_stream_id; + need_sync_ = false; + } + void CopySharedTo(CUDAContext* ctx) { + CHECK(ctx); + CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; + ctx->cublas_fp32_ = cublas_fp32_; + } + + const cudaStream_t& exec_stream() const { return exec_stream_; } + void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } + + const cudaStream_t& io_stream() const { return io_stream_; } + void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } + + std::shared_ptr> cublas_fp32() { return cublas_fp32_; } + void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { + cublas_fp32_ = cublas_fp32; + } + + const std::vector& input_events() { return input_events_; } + void SetInputEvents(const std::vector& input_events) { + input_events_.clear(); + input_events_.assign(input_events.begin(), input_events.end()); + } + + const std::vector& output_events() { return output_events_; } + void SetOutputEvents(const std::vector& output_events) { + output_events_.clear(); + output_events_.assign(output_events.begin(), output_events.end()); + } + + std::vector all_exec_streams() { + int dev_id = TargetWrapper::GetCurDevice(); + return devs[dev_id].exec_streams(); + } + + void SetSyncStreams(const std::vector& nums) { + sync_streams_.clear(); + std::vector exec_streams = all_exec_streams(); + for (size_t i = 0; i < nums.size(); ++i) { + CHECK(nums[i] >= 0 && nums[i] < static_cast(exec_streams.size())) + << "streams id is not valid"; + sync_streams_.push_back(exec_streams[nums[i]]); + } + InitSyncEvents(nums.size()); + } + + void InitSyncEvents(const 
int num) { + sync_events_.clear(); + for (int i = 0; i < num; ++i) { + cudaEvent_t eve; + TargetWrapperCuda::CreateEventWithFlags(&eve); + sync_events_.push_back(eve); + } + } + + void SetNeedSync(bool sync) { need_sync_ = sync; } + bool need_sync() const { return need_sync_; } + + void Sync() { + CHECK_EQ(sync_streams_.size(), sync_events_.size()); + for (size_t i = 0; i < sync_events_.size(); ++i) { + TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]); + TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]); + } + } + + std::string name() const { return "CUDAContext"; } + + CUDAContext& operator=(const CUDAContext& context) { + this->Init( + context.device_id_, context.exec_stream_id_, context.io_stream_id_); + cublas_fp32_ = const_cast(context).cublas_fp32(); + return *this; + } + + private: + int device_id_; + // overall information + int exec_stream_id_; + int io_stream_id_; + cudaStream_t exec_stream_; + cudaStream_t io_stream_; + + // not thread-safe, should allocate for each thread. + std::shared_ptr> cublas_fp32_; + + // kernel information + std::vector input_events_; + std::vector output_events_; + // multi stream sync. + std::vector sync_streams_; + std::vector sync_events_; + bool need_sync_; +}; + +} // namespace lite +} // namespace paddle diff --git a/lite/backends/opencl/cl_context.cc b/lite/backends/opencl/cl_context.cc index 0edb83acc4772b2f878b22f2ea16b3175b14a7ba..eff959d992200592c21a024f56713b9abb4b87fb 100644 --- a/lite/backends/opencl/cl_context.cc +++ b/lite/backends/opencl/cl_context.cc @@ -58,7 +58,7 @@ void CLContext::AddKernel(const std::string &kernel_name, auto program = GetProgram(file_name, options); VLOG(3) << " --- end get program --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; - std::unique_ptr kernel( + std::shared_ptr kernel( new cl::Kernel(program, kernel_name.c_str(), &status)); CL_CHECK_FATAL(status); VLOG(3) << " --- end create kernel --- "; diff --git a/lite/backends/opencl/cl_context.h b/lite/backends/opencl/cl_context.h index 586dc3df1267e47c6cdaad1d362cd9ed2df2770e..41059a0d42a95bbffed4c41611b9f3b8ac60861c 100644 --- a/lite/backends/opencl/cl_context.h +++ b/lite/backends/opencl/cl_context.h @@ -29,13 +29,14 @@ class CLContext { public: ~CLContext() { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { - clReleaseKernel(kernels_[kidx]->get()); + // Note(ysh329): Don't need `clReleaseKernel` kernels_[kidx].reset(); } kernels_.clear(); kernel_offset_.clear(); for (auto &p : programs_) { - clReleaseProgram(p.second->get()); + // Note(ysh329): Dont't need `clReleaseProgram` + p.second.reset(); } programs_.clear(); LOG(INFO) << "release cl::Program, cl::Kernel finished."; @@ -66,9 +67,10 @@ class CLContext { int divitor = 2); // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // size_t max_work_size); + private: std::unordered_map> programs_; - std::vector> kernels_; + std::vector> kernels_; std::map kernel_offset_; }; diff --git a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl index 9427692f1267d363222295b33b6834e28517d0a4..515bf57487ffd93959929ea93f76b0fdd888c4a5 100644 --- a/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl +++ b/lite/backends/opencl/cl_kernel/image/bilinear_interp_kernel.cl @@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, if (ceil_h > in_dims_h - 1) { ceil_h = in_dims_h- 1; } - float wight0_w = center_w - floor_w; - float wight0_h = center_h - 
floor_h; - float wight1_w = 1.0 - wight0_w; - float wight1_h = 1.0 - wight0_h; + CL_DTYPE wight0_w = center_w - floor_w; + CL_DTYPE wight0_h = center_h - floor_h; + CL_DTYPE wight1_w = 1.0 - wight0_w; + CL_DTYPE wight1_h = 1.0 - wight0_h; const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | @@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); } diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 8a6b026367986548b017aee263a70d4df33381b5..d5b2d70b09a84cb405c0e7c8f2b55f4254eb7f64 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -29,12 +29,12 @@ CLRuntime::~CLRuntime() { command_queue_->flush(); command_queue_->finish(); } - // For controlling the destruction order: + // For controlling the destruction order command_queue_.reset(); context_.reset(); device_.reset(); platform_.reset(); - LOG(INFO) << "release ~CLRuntime() "; + device_info_.clear(); } bool CLRuntime::Init() { diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 2a8996b066a480d9c0a6db67fa5fd60142885046..503b3a011642a8e018781c08647a958c521e6fac 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -55,7 +55,7 @@ class CLRuntime { std::map& GetDeviceInfo(); private: - CLRuntime() = default; + CLRuntime() { Init(); } ~CLRuntime(); diff --git a/lite/backends/rknpu/CMakeLists.txt b/lite/backends/rknpu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec60c80759cfc02e25a82eb795746c8b93e7cfe --- /dev/null +++ b/lite/backends/rknpu/CMakeLists.txt @@ -0,0 +1,5 @@ +if(NOT LITE_WITH_RKNPU) + return() +endif() + +lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs}) diff --git a/lite/backends/rknpu/device.cc b/lite/backends/rknpu/device.cc new file mode 100644 index 0000000000000000000000000000000000000000..5b486259b3b328713062648df445f94735ae6380 --- /dev/null +++ b/lite/backends/rknpu/device.cc @@ -0,0 +1,42 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
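To make the intent of the new RKNPU device helper below concrete, a hedged usage sketch; the call-site function and names are hypothetical, and building the rk::nn::Graph and its tensors through the rknpu DDK is omitted:

#include <memory>
#include <string>
#include <vector>

#include "lite/backends/rknpu/device.h"

// Hypothetical call site; graph, ins and outs must already be built via the
// rknpu DDK by the subgraph bridge.
std::unique_ptr<rk::nn::Exection> CompileSubgraph(
    rk::nn::Graph* graph,
    std::vector<std::shared_ptr<rk::nn::Tensor>> ins,
    std::vector<std::shared_ptr<rk::nn::Tensor>> outs) {
  std::string model_name = "subgraph_0";  // illustrative only
  // Build() registers the inputs/outputs on the graph, compiles it, and
  // returns an Exection handle that the subgraph kernel can later run.
  return paddle::lite::rknpu::Device::Global().Build(
      model_name, graph, ins, outs);
}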
+ +#include "lite/backends/rknpu/device.h" +#include "lite/utils/cp_logging.h" + +namespace paddle { +namespace lite { +namespace rknpu { + +std::unique_ptr Device::Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ) { + VLOG(3) << "[RKNPU] Build model"; + + rk_graph->SetInputsOutputs(input_nodes, output_nodes); + + std::unique_ptr exector = + std::unique_ptr(new rk::nn::Exection(rk_graph)); + + exector->Build(); + + return exector; +} + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/rknpu/device.h b/lite/backends/rknpu/device.h new file mode 100644 index 0000000000000000000000000000000000000000..9284725aac7fbd9840aef64b7e8f411059f9ba15 --- /dev/null +++ b/lite/backends/rknpu/device.h @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "rknpu/rknpu_pub.h" // NOLINT + +namespace paddle { +namespace lite { +namespace rknpu { + +class Device { + public: + static Device& Global() { + static Device x; + return x; + } + Device() {} + + // Build the RK IR graph to om model, return RK model exector to + // load om model and run inference. 
+ std::unique_ptr Build( + std::string& model_name, // NOLINT + rk::nn::Graph* rk_graph, // NOLINT + std::vector> input_nodes, // NOLINT + std::vector> output_nodes // NOLINT + ); // NOLINT + + private: +}; + +} // namespace rknpu +} // namespace lite +} // namespace paddle diff --git a/lite/backends/x86/CMakeLists.txt b/lite/backends/x86/CMakeLists.txt index 63b41ae77d0f3949e3d1de13f9db5ca99b4f1c41..38b47ae3120608c7950a1f081e9ec2b133fb955e 100644 --- a/lite/backends/x86/CMakeLists.txt +++ b/lite/backends/x86/CMakeLists.txt @@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) -lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) +lite_cc_library(x86_cpu_info SRCS cpu_info.cc) add_subdirectory(jit) add_subdirectory(math) diff --git a/lite/backends/x86/dynamic_loader.cc b/lite/backends/x86/dynamic_loader.cc index a05a57e93b23008e49683764b5ed669d5c425e5b..2aaa798fa94b7dd47e4dc15d50e663b8fd3c083a 100644 --- a/lite/backends/x86/dynamic_loader.cc +++ b/lite/backends/x86/dynamic_loader.cc @@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { void* GetMKLMLDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) - return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); + return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib"); #elif defined(_WIN32) return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); #else diff --git a/lite/backends/x86/jit/gen/matmul.cc b/lite/backends/x86/jit/gen/matmul.cc index 010c80fac4842e74c9b8272db472ddf6cf954771..f78df73f66532f891721c74cff9c78cc3bb61922 100644 --- a/lite/backends/x86/jit/gen/matmul.cc +++ b/lite/backends/x86/jit/gen/matmul.cc @@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { for (size_t g = 0; g < groups.size(); ++g) { size_t x_offset = 0; size_t wgt_offset_tmp = 0; - for (int i = 0; i < g; ++i) { + for (size_t i = 0; i < g; ++i) { wgt_offset_tmp += groups[i] * block_len; } for (int k = 0; k < k_; ++k) { diff --git a/lite/backends/x86/jit/gen_base.cc b/lite/backends/x86/jit/gen_base.cc index 7d051aa6f5802844753b71fd43400e20b7f5965b..a3376be423828b25c6eda6fff30a56578c7bbbe5 100644 --- a/lite/backends/x86/jit/gen_base.cc +++ b/lite/backends/x86/jit/gen_base.cc @@ -28,6 +28,12 @@ #define posix_memalign_free free #endif +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 
0 : errno) +#endif + // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); @@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { void* GenBase::operator new(size_t size) { void* ptr; constexpr size_t alignment = 32ul; +#ifdef _WIN32 + ptr = _aligned_malloc(size, alignment); +#else PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), 0, "GenBase Alloc %ld error!", size); +#endif PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); return ptr; } diff --git a/lite/backends/x86/math/beam_search.cc b/lite/backends/x86/math/beam_search.cc index 9cf3281152840416dc141f98992499c663783b7a..5d7e98629cb89bd7a3fdee852507e0f381e54931 100644 --- a/lite/backends/x86/math/beam_search.cc +++ b/lite/backends/x86/math/beam_search.cc @@ -265,7 +265,7 @@ class BeamSearchFunctor { // size_t num_seqs = scores->NumElements(lod_level); size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t seq_width = 1; - for (int i = 1; i < scores->dims().size(); i++) { + for (size_t i = 1; i < scores->dims().size(); i++) { seq_width *= scores->dims()[i]; } diff --git a/lite/backends/x86/math/blas.cc b/lite/backends/x86/math/blas.cc index 2d21adaf5d22930ff720c193696eb00c8035579d..3bc5f9f67ad96e7ec699400ff6369fe48c745b7e 100644 --- a/lite/backends/x86/math/blas.cc +++ b/lite/backends/x86/math/blas.cc @@ -23,7 +23,7 @@ namespace math { MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, int num_flatten_cols, bool trans) { - PADDLE_ENFORCE_GT(tensor_dim.size(), 1); + PADDLE_ENFORCE_GT(tensor_dim.size(), 1u); MatDescriptor retv; if (num_flatten_cols > 1) { auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); diff --git a/lite/backends/x86/math/sequence_pooling.cc b/lite/backends/x86/math/sequence_pooling.cc index 34c55c5714e467954bc1bb79d9b1385ef5cfe497..2d00ebad61840da5b14fbf12d9255394b2b2df1a 100644 --- a/lite/backends/x86/math/sequence_pooling.cc +++ b/lite/backends/x86/math/sequence_pooling.cc @@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { auto in_dims = input.dims(); auto out_dims = output->dims(); auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, out_dims); @@ -95,9 +95,9 @@ class MaxSeqPoolFunctor { lite::Tensor* index) { auto in_dims = input.dims(); auto out_dims = output->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), 1); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { + PADDLE_ENFORCE_GT(in_dims.size(), 1u); + PADDLE_ENFORCE_GT(out_dims.size(), 1u); + for (size_t i = 1; i < in_dims.size(); ++i) { PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); } @@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { auto idx_dims = index.dims(); PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1); - for (int64_t i = 1; i < og_dims.size(); ++i) { + for (size_t i = 1; i < og_dims.size(); ++i) { PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); } PADDLE_ENFORCE_EQ(idx_dims, og_dims); diff --git a/lite/backends/x86/parallel.h b/lite/backends/x86/parallel.h index 0689ec4c234509cee6f10f8e0f7dd432edae5c4e..49794b8e15a8f90a6512798baa842534df879f6b 100644 --- a/lite/backends/x86/parallel.h +++ b/lite/backends/x86/parallel.h @@ 
-38,7 +38,7 @@ static inline int64_t GetMaxThreads() { // Do not support nested omp parallem. num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); #endif - return std::max(num_threads, 1L); + return std::max(num_threads, 1L); } using ThreadHandler = diff --git a/lite/backends/x86/port.h b/lite/backends/x86/port.h index c1b81159aca979efe4b46777a1cef49e44b95e27..0e1e2b77b796eae201c55edcd3caecc263e4271e 100644 --- a/lite/backends/x86/port.h +++ b/lite/backends/x86/port.h @@ -14,10 +14,10 @@ #pragma once +#include #include #include -#include #include #include @@ -37,7 +37,9 @@ #define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include +#define NOMINMAX // msvc max/min macro conflict with std::min/max #include +#include #include // std::accumulate in msvc #ifndef S_ISDIR // windows port for sys/stat.h #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) @@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +extern struct timeval; static int gettimeofday(struct timeval *tp, void *tzp) { time_t clock; struct tm tm; diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt index 278f971b0b1ee8a0b941158839fcc6810e25ad67..55c83cdb4d02d485054ea4d7f3b90fb9f7aa3dc1 100644 --- a/lite/core/CMakeLists.txt +++ b/lite/core/CMakeLists.txt @@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) proto_library(framework_proto SRCS framework.proto) endif() -if (LITE_WITH_X86) lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(types SRCS types.cc) -else() -lite_cc_library(variable SRCS variable.cc DEPS tensor) -lite_cc_library(types SRCS types.cc) -endif() lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor) @@ -38,7 +33,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) if (LITE_WITH_ARM) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) else() -lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) +lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context) endif() #-------------------------------------------- GET CODE META INFO ------------------------------------------ diff --git a/lite/core/arena/CMakeLists.txt b/lite/core/arena/CMakeLists.txt index afc104073684ff00395fb32335630705ff3f7bc8..75971570fb078ce4e39413e5b3df629fe2a7ac3e 100644 --- a/lite/core/arena/CMakeLists.txt +++ b/lite/core/arena/CMakeLists.txt @@ -6,5 +6,5 @@ endif() lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) - lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) + lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) endif() diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc index 614ee990a9811ab74ceedb4fa000fa385698d679..731215f542567ec3ff0cc87d6990624bfa6b2bc2 100644 --- a/lite/core/arena/framework.cc +++ b/lite/core/arena/framework.cc @@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { 
CHECK(!shared_tensor_array->empty()) << "shared_tensor_array is empty yet"; target_tensor_array->resize(shared_tensor_array->size()); - for (int i = 0; i < shared_tensor_array->size(); i++) { + for (size_t i = 0; i < shared_tensor_array->size(); i++) { target_tensor_array->at(i).Resize( shared_tensor_array->at(i).dims()); TargetCopy(param_type->type->target(), @@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, auto b_tensor_array = base_scope_->FindVar(var_name)->GetMutable>(); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); - for (int i = 0; i < a_tensor_array->size(); i++) { + for (size_t i = 0; i < a_tensor_array->size(); i++) { Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i)); if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { diff --git a/lite/core/arena/framework.h b/lite/core/arena/framework.h index 7050355fbfae55b9ba626119cd95f8e952c27430..20a0792155f0b4ea8faa7c3fc15ea5c4767352ac 100644 --- a/lite/core/arena/framework.h +++ b/lite/core/arena/framework.h @@ -166,7 +166,7 @@ class TestCase { // TODO(Superjomn) Move this method to utils or DDim? bool ShapeEquals(const DDim& a, const DDim& b) { if (a.size() != b.size()) return false; - for (int i = 0; i < a.size(); i++) { + for (size_t i = 0; i < a.size(); i++) { if (a[i] != b[i]) return false; } return true; diff --git a/lite/core/context.h b/lite/core/context.h index 061638d63f5187bbfe296afbc3679d9b390a6457..bacb570a903d807945cb9e2a8b98615fcaba9384 100644 --- a/lite/core/context.h +++ b/lite/core/context.h @@ -16,8 +16,7 @@ #include "lite/utils/any.h" #ifdef LITE_WITH_CUDA -#include "lite/backends/cuda/blas.h" -#include "lite/backends/cuda/cuda_utils.h" +#include "lite/backends/cuda/context.h" #endif #ifdef LITE_WITH_OPENCL #include @@ -53,14 +52,15 @@ class Context; using HostContext = Context; using X86Context = Context; -using CUDAContext = Context; using ARMContext = Context; using NPUContext = Context; +using APUContext = Context; using XPUContext = Context; using OpenCLContext = Context; using FPGAContext = Context; using BMContext = Context; using MLUContext = Context; +using RKNPUContext = Context; template <> class Context { @@ -88,6 +88,21 @@ class Context { }; #endif +#ifdef LITE_WITH_APU +template <> +class Context { + public: + Context() {} + explicit Context(const APUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(APUContext* ctx) {} + + APUContext& operator=(const APUContext& ctx) {} + std::string name() const { return "APUContext"; } +}; +#endif + #ifdef LITE_WITH_BM template <> class Context { @@ -105,6 +120,21 @@ class Context { }; #endif +#ifdef LITE_WITH_RKNPU +template <> +class Context { + public: + Context() {} + explicit Context(const RKNPUContext& ctx); + // NOTE: InitOnce should only be used by ContextScheduler + void InitOnce() {} + void CopySharedTo(RKNPUContext* ctx) {} + + RKNPUContext& operator=(const RKNPUContext& ctx) {} + std::string name() const { return "RKNPUContext"; } +}; +#endif + #ifdef LITE_WITH_XPU template <> class Context { @@ -286,103 +316,6 @@ class Context { }; #endif // LITE_WITH_MLU -#ifdef LITE_WITH_CUDA -// Only works with CUDA kernels. 
-template <> -class Context { - public: - typename Env::Devs& devs = - Env::Global(); - // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() { - if (devs.size() > 0) { - cublas_fp32_ = std::make_shared>(); - } else { - LOG(INFO) << "No cuda device(s) found, CUDAContext init failed."; - } - } - void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) { - CHECK_GT(devs.size(), 0UL) - << "Env is not initialized or current target is not exit!"; - if (dev_id >= static_cast(devs.size())) { - LOG(WARNING) << "device index exceeds the number of devices, set to " - "default device(0)!"; - device_id_ = 0; - } else { - device_id_ = dev_id; - } - if (io_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "data stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - io_stream_id = 0; - } - if (exec_stream_id >= devs[dev_id].max_stream()) { - LOG(WARNING) << "exec stream index exceeds the maximum stream number, " - "set to default stream(0)!"; - exec_stream_id = 0; - } - - exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id]; - io_stream_ = devs[dev_id].io_streams()[io_stream_id]; - - exec_stream_id_ = exec_stream_id; - io_stream_id_ = io_stream_id; - } - void CopySharedTo(CUDAContext* ctx) { - CHECK(ctx); - CHECK(cublas_fp32_) << "cublas_fp32 should be set first"; - ctx->cublas_fp32_ = cublas_fp32_; - } - - const cudaStream_t& exec_stream() const { return exec_stream_; } - void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; } - - const cudaStream_t& io_stream() const { return io_stream_; } - void SetIoStream(cudaStream_t stream) { io_stream_ = stream; } - - std::shared_ptr> cublas_fp32() { return cublas_fp32_; } - void SetCuBlasFP32(std::shared_ptr> cublas_fp32) { - cublas_fp32_ = cublas_fp32; - } - - const std::vector& input_events() { return input_events_; } - void SetInputEvents(const std::vector& input_events) { - input_events_.clear(); - input_events_.assign(input_events.begin(), input_events.end()); - } - - const std::vector& output_events() { return output_events_; } - void SetOutputEvents(const std::vector& output_events) { - output_events_.clear(); - output_events_.assign(output_events.begin(), output_events.end()); - } - - std::string name() const { return "CUDAContext"; } - - CUDAContext& operator=(const CUDAContext& context) { - this->Init( - context.device_id_, context.exec_stream_id_, context.io_stream_id_); - cublas_fp32_ = const_cast(context).cublas_fp32(); - return *this; - } - - private: - int device_id_; - // overall information - int exec_stream_id_; - int io_stream_id_; - cudaStream_t exec_stream_; - cudaStream_t io_stream_; - - // not thread-safe, should allocate for each thread. 
- std::shared_ptr> cublas_fp32_; - - // kernel information - std::vector input_events_; - std::vector output_events_; -}; -#endif - #ifdef LITE_WITH_X86 template <> class Context { @@ -455,7 +388,9 @@ class ContextScheduler { return *x; } - std::unique_ptr NewContext(TargetType target) { + std::unique_ptr NewContext( + TargetType target, + /*only used for cuda context*/ int exec_stream_id = 0) { std::unique_ptr ctx(new KernelContext); switch (target) { case TARGET(kHost): @@ -472,7 +407,7 @@ class ContextScheduler { case TARGET(kCUDA): { int dev_id = TargetWrapper::GetCurDevice(); auto& context = ctx->As(); - context.Init(dev_id); + context.Init(dev_id, exec_stream_id); kernel_contexts_[TargetType::kCUDA].As().CopySharedTo( &context); } break; @@ -489,6 +424,18 @@ class ContextScheduler { &ctx->As()); break; #endif +#ifdef LITE_WITH_APU + case TARGET(kAPU): + kernel_contexts_[TargetType::kAPU].As().CopySharedTo( + &ctx->As()); + break; +#endif +#ifdef LITE_WITH_RKNPU + case TARGET(kRKNPU): + kernel_contexts_[TargetType::kRKNPU].As().CopySharedTo( + &ctx->As()); + break; +#endif #ifdef LITE_WITH_XPU case TARGET(kXPU): kernel_contexts_[TargetType::kXPU].As().CopySharedTo( @@ -558,6 +505,12 @@ class ContextScheduler { #ifdef LITE_WITH_NPU InitContext(); #endif +#ifdef LITE_WITH_APU + InitContext(); +#endif +#ifdef LITE_WITH_RKNPU + InitContext(); +#endif #ifdef LITE_WITH_XPU InitContext(); #endif diff --git a/lite/core/device_info.cc b/lite/core/device_info.cc index 29ac96ed744b016833a746b35002dd68109efd8b..09da06a4168268c670577c159a2a306a8959d81d 100644 --- a/lite/core/device_info.cc +++ b/lite/core/device_info.cc @@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { active_ids_ = core_ids_; } else { active_ids_.resize(thread_num); - for (int i = 0; i < thread_num; ++i) { + for (uint32_t i = 0; i < thread_num; ++i) { if (i < big_core_ids_.size()) { active_ids_[i] = big_core_ids_[i]; } else { diff --git a/lite/core/device_info.h b/lite/core/device_info.h index a108ae3d4b564aaac02a63ead9a35eba26a6cf63..b06eb8d944735971133bb7a29aa0f06075e60626 100644 --- a/lite/core/device_info.h +++ b/lite/core/device_info.h @@ -159,7 +159,7 @@ class Env { static Devs* devs = new Devs(); return *devs; } - static void Init(int max_stream = 4) { + static void Init(int max_stream = 6) { #ifdef LITE_WITH_MLU CNRT_CALL(cnrtInit(0)); #endif @@ -175,6 +175,7 @@ class Env { } else { LOG(INFO) << "Found " << count << " device(s)"; } + CHECK_GT(max_stream, 0) << "max_stream must be greater than 0."; // create all device for (int i = 0; i < count; i++) { auto dev = Device(i, max_stream); @@ -234,8 +235,8 @@ class Device { std::string name() { return device_prop_.name; } int core_num() { return device_prop_.multiProcessorCount; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } - std::vector exec_streams() { return exec_stream_; } - std::vector io_streams() { return io_stream_; } + const std::vector& exec_streams() { return exec_stream_; } + const std::vector& io_streams() { return io_stream_; } int sm_version() { return sm_version_; } bool has_fp16() { return has_fp16_; } diff --git a/lite/core/kernel.cc b/lite/core/kernel.cc index 7ec718cb3881c10dec08376419b419777c71bba6..194d736a4c0cf6fa18eae119589c5fa1fd08bca0 100644 --- a/lite/core/kernel.cc +++ b/lite/core/kernel.cc @@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, std::string *alias, Place *place) { auto parts = Split(kernel_type, "/"); - CHECK_EQ(parts.size(), 5); + 
CHECK_EQ(parts.size(), 5u); *op_type = parts[0]; *alias = parts[1]; diff --git a/lite/core/mir/CMakeLists.txt b/lite/core/mir/CMakeLists.txt index 91accc907ed16b2de64e5982b88d38029fd2902b..d036bf7988b98e64586e42683d33b4696e9ff706 100644 --- a/lite/core/mir/CMakeLists.txt +++ b/lite/core/mir/CMakeLists.txt @@ -37,6 +37,7 @@ lite_cc_library(mir_passes demo_pass.cc runtime_context_assign_pass.cc memory_optimize_pass.cc + multi_stream_analysis_pass.cc mlu_postprocess_pass.cc weight_quantization_preprocess_pass.cc quantized_op_attributes_inference_pass.cc diff --git a/lite/core/mir/fusion/conv_bn_fuser.cc b/lite/core/mir/fusion/conv_bn_fuser.cc index 150a6e68d8a924ebfa96fdffb99e28b230689a48..143a7cecce8c1c45ada9ad31e8e4bea5447fec68 100644 --- a/lite/core/mir/fusion/conv_bn_fuser.cc +++ b/lite/core/mir/fusion/conv_bn_fuser.cc @@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { } size_t weight_num = conv_weight_t->data_size(); bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; - bool is_weight_quantization = - conv_op_desc->HasAttr("quantize_weight_bits") ? true : false; + bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits"); // comupte BN alpha and beta Tensor alpha_tensor, beta_tensor; @@ -164,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; - for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { - for (unsigned int i = 0; i < h; ++i) { + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { weight_scale[i] *= fabsf(alpha_data[i]); if (alpha_data[i] < 0.f) { auto ptr_row = conv_weight_d + k * c_size + i * hw; - for (unsigned int j = 0; j < hw; ++j) { + for (int j = 0; j < hw; ++j) { ptr_row[j] *= -1; } } } } } else { - for (unsigned int i = 0; i < h; ++i) { + for (int i = 0; i < h; ++i) { weight_scale[i] *= fabsf(alpha_data[i]); if (alpha_data[i] < 0.f) { auto ptr_row = conv_weight_d + i * w; - for (unsigned int j = 0; j < w; ++j) { + for (int j = 0; j < w; ++j) { ptr_row[j] *= -1; } } @@ -204,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; - for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { - for (unsigned int i = 0; i < h; ++i) { + for (int k = 0; k < conv_weight_t->dims()[0]; ++k) { + for (int i = 0; i < h; ++i) { auto ptr_row = conv_weight_d + k * c_size + i * hw; - for (unsigned int j = 0; j < hw; ++j) { + for (int j = 0; j < hw; ++j) { ptr_row[j] *= alpha_data[i]; } } } } else { - for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels - for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels + for (int i = 0; i < h; ++i) { // n: conv2d output channels + for (int j = 0; j < w; ++j) { // w: conv2d input channels conv_weight_d[i * w + j] *= alpha_data[i]; } } diff --git a/lite/core/mir/fusion/quant_dequant_op_fuser.cc b/lite/core/mir/fusion/quant_dequant_op_fuser.cc index a3a98b871fb4b6f8230299cda978b0f1f8faa779..2c7cc2fe5547d6004ded99f28698478cec0a3639 100644 --- a/lite/core/mir/fusion/quant_dequant_op_fuser.cc +++ b/lite/core/mir/fusion/quant_dequant_op_fuser.cc @@ -260,7 +260,7 @@ void 
ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, auto channel_scale_tensor = scope->FindVar(channel_scale_name)->GetMutable(); auto* channel_scale_data = channel_scale_tensor->data(); - for (int i = 0; i < channel_scale_tensor->data_size(); i++) { + for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) { weight_scale.push_back(channel_scale_data[i] / range); } diff --git a/lite/core/mir/generate_program_pass.cc b/lite/core/mir/generate_program_pass.cc index 76c97d2da6ed9e7c6fc1f1889d80095278b68ec0..d7486c0933dbbe74115bd6358962817b2b946c12 100644 --- a/lite/core/mir/generate_program_pass.cc +++ b/lite/core/mir/generate_program_pass.cc @@ -14,6 +14,7 @@ #include "lite/core/mir/generate_program_pass.h" #include +#include #include #include #include "lite/core/mir/graph_visualize_pass.h" @@ -25,10 +26,37 @@ namespace mir { void GenerateProgramPass::Apply(const std::unique_ptr& graph) { VLOG(4) << "final program \n" << Visualize(graph.get()); - for (auto& item : graph->StmtTopologicalOrder()) { + std::vector nodes_in_order; +#ifdef LITE_WITH_CUDA + const std::string depend_pass = "multi_stream_analysis_pass"; + const std::string attr_name = "nodes_in_order"; + mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass); + if (pass->HasAttr(attr_name)) { + nodes_in_order = pass->GetAttr>(attr_name); + } +#endif + if (nodes_in_order.empty()) { + nodes_in_order = graph->StmtTopologicalOrder(); + } + + for (auto& item : nodes_in_order) { if (item->IsStmt()) { auto& stmt = item->AsStmt(); VLOG(4) << stmt; +#ifdef LITE_WITH_CUDA + if (stmt.kernels().front()->target() == TargetType::kCUDA) { + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetNeedSync(stmt.need_sync_); + stmt.kernels() + .front() + ->mutable_context() + ->As() + .SetSyncStreams(stmt.sync_streams_); + } +#endif insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); } } diff --git a/lite/core/mir/graph_visualize_pass.cc b/lite/core/mir/graph_visualize_pass.cc index a32c9c05f69e5c31b77bc0d2ff976560f29b9bec..55b7a004567ec5a5298e084839d6dcf5a8591882 100644 --- a/lite/core/mir/graph_visualize_pass.cc +++ b/lite/core/mir/graph_visualize_pass.cc @@ -85,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { if (!node->IsStmt()) continue; auto op_info = node->AsStmt().op_info(); auto op_type = op_info->Type(); - std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); + std::string op_name; + if (node->AsStmt().need_sync_) { + std::ostringstream oss; + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + oss << std::to_string(node->AsStmt().sync_streams_[i]); + if (i != node->AsStmt().sync_streams_.size() - 1) { + oss << ","; + } + } + op_name = string_format("%s%d, stream=%d, sync_streams={%s}", + op_type.c_str(), + op_idx++, + node->AsStmt().stream_id_, + oss.str().c_str()); + } else { + op_name = string_format("%s%d", op_type.c_str(), op_idx++); + } // Add its input&output variables as the Dot nodes dot.AddNode(op_name, {Dot::Attr("shape", "box"), @@ -93,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) { Dot::Attr("color", "black"), Dot::Attr("fillcolor", "yellow")}); for (auto& x : node->inlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); @@ -101,7 +123,13 @@ std::string 
Visualize(mir::SSAGraph* graph) { dot.AddEdge(var_name, op_name, {}); } for (auto& x : node->outlinks) { - auto var_name = x->AsArg().name; + std::string var_name; + if (x->AsArg().lane != -1) { + var_name = string_format( + "%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane); + } else { + var_name = x->AsArg().name; + } if (!exists_var_names.count(var_name)) { dot.AddNode(var_name, {}); exists_var_names.insert(var_name); diff --git a/lite/core/mir/memory_optimize_pass.cc b/lite/core/mir/memory_optimize_pass.cc index 38293ede76ed35bf05767ce1333947b7dfdbc4ac..12b4eab0a9582af6d2d4abd3941e75b99a3e39a6 100644 --- a/lite/core/mir/memory_optimize_pass.cc +++ b/lite/core/mir/memory_optimize_pass.cc @@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) .BindTargets({TARGET(kARM), TARGET(kOpenCL)}) - .ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); + .ExcludeTargets({TARGET(kNPU), + TARGET(kXPU), + TARGET(kBM), + TARGET(kRKNPU), + TARGET(kAPU)}); diff --git a/lite/core/mir/mlu_postprocess_pass.cc b/lite/core/mir/mlu_postprocess_pass.cc index 15f62f36b0f026dc42ecbb274c946e294c7fc44e..ba48d5d4ead5ea922ded0bff3a87c2c127595790 100644 --- a/lite/core/mir/mlu_postprocess_pass.cc +++ b/lite/core/mir/mlu_postprocess_pass.cc @@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, // get subgraph op's type info size_t kernel_size = inst_node->AsStmt().kernels().size(); - CHECK_GT(kernel_size, 0); + CHECK_GT(kernel_size, 0u); VLOG(4) << "subgraph kernel size: " << kernel_size; for (size_t i = 0; i < kernel_size; ++i) { @@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { auto* block_desc = static_cast(inst->AsStmt().op().get()) ->GetSubBlock(); - for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { + for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { auto op_desc = block_desc->GetOp(op_idx); CHECK(op_desc); if (op_desc->Type() == "conv2d") { diff --git a/lite/core/mir/multi_stream_analysis_pass.cc b/lite/core/mir/multi_stream_analysis_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..46454a1fc357c7d96162a58a43a6c34bc890bc69 --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.cc @@ -0,0 +1,313 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
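The hand-off shown above between the two passes relies on the new pass attribute mechanism: multi_stream_analysis_pass stores the reordered node list under "nodes_in_order", and generate_program_pass reads it back when present. A self-contained sketch of that pattern, using C++17 std::any in place of the project's variant utility and plain ints in place of Node* (both substitutions are assumptions made for brevity; ToyPass and main are illustrative names):

    #include <any>
    #include <cassert>
    #include <string>
    #include <unordered_map>
    #include <vector>

    // Minimal stand-in for the Pass attribute store: SetAttr copies the value,
    // GetAttr returns a typed reference, HasAttr guards the lookup.
    class ToyPass {
     public:
      bool HasAttr(const std::string& name) const { return attrs_.count(name) > 0; }

      template <typename T>
      void SetAttr(const std::string& name, const T* value) {
        attrs_[name] = *value;  // the real Pass stores into a variant utility
      }

      template <typename T>
      const T& GetAttr(const std::string& name) const {
        return std::any_cast<const T&>(attrs_.at(name));
      }

     private:
      std::unordered_map<std::string, std::any> attrs_;
    };

    int main() {
      ToyPass analysis;                    // plays multi_stream_analysis_pass
      std::vector<int> order = {2, 0, 1};  // plays the reordered Node* list
      analysis.SetAttr("nodes_in_order", &order);

      // plays generate_program_pass reading the order back
      if (analysis.HasAttr("nodes_in_order")) {
        const auto& got = analysis.GetAttr<std::vector<int>>("nodes_in_order");
        assert(got.size() == 3);
      }
      return 0;
    }
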
+ +#include "lite/core/mir/multi_stream_analysis_pass.h" + +#include +#include +#include +#include + +#include "lite/core/device_info.h" +#include "lite/core/mir/graph_visualize_pass.h" +#include "lite/core/mir/pass_registry.h" +#include "lite/core/type_system.h" + +namespace paddle { +namespace lite { +namespace mir { + +void MultiStreamAnalysisPass::CleanUp() { + exec_ops_.clear(); + wait_que_.clear(); + wait_que_cpu_.clear(); + std::queue empty_queue; + while (!exec_que_.empty()) { + exec_que_.pop(); + } + ops_in_streams_.clear(); + resources_.clear(); + map_arg_to_lane_.clear(); + op_types_set_.clear(); + io_copy_once_num_ = 0; +} + +void MultiStreamAnalysisPass::Init(SSAGraph* graph) { + // If not cleaned, the clone will overlay the previous state + CleanUp(); + for (auto& op_node : graph->StmtTopologicalOrder()) { + if (op_node->IsStmt()) { + // Set all outputs of op to inaccessible state. + auto outputs = op_node->outlinks; + for (Node* node : outputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (!resources_.count(arg.name)) { + resources_[arg.name] = false; + } + } + // Set the weight input of op to be accessible. + auto inputs = op_node->inlinks; + for (Node* node : inputs) { + CHECK(node->IsArg()); + auto& arg = node->AsArg(); + if (arg.is_weight || arg.is_persist) { + resources_[arg.name] = true; + } + } + + // feed and io_copy_once op has no dependencies and can be launched + // directly. Other ops are put into the waiting queue. + if (op_node->AsStmt().op_type() == "feed" || + op_node->AsStmt().op_type() == "io_copy_once") { + exec_que_.push(op_node); + } else { + auto tgt = op_node->AsStmt().kernels().front()->target(); + if (tgt == TargetType::kCUDA) { + wait_que_.push_back(op_node); + } else { + wait_que_cpu_.push_back(op_node); + } + } + op_types_set_.insert(op_node->AsStmt().op_type()); + } + } + + // Set the stream id according to the number of feed ops, and set the output + // of the feed op to be accessible. + int lane = 0; + auto nodes = graph->inputs(); + ops_in_streams_.resize(max_stream_); + + for (auto& node : nodes) { + std::string::size_type idx = node->AsArg().name.find("feed"); + if (idx != std::string::npos) { + for (auto& feed_ops : node->outlinks) { + if (feed_ops->AsStmt().op_type() == "feed") { + // feed op doesn't need to wait sync. 
+ feed_ops->AsStmt().need_sync_ = false; + CHECK_EQ(static_cast(feed_ops->outlinks.size()), 1) + << "feed op must have one output."; + for (auto& var : feed_ops->outlinks) { + var->AsArg().lane = lane; + map_arg_to_lane_[var->AsArg().name] = lane; + resources_[var->AsArg().name] = true; + } + feed_ops->AsStmt().stream_id_ = lane; + ops_in_streams_[lane].push_back(feed_ops); + ++lane; + if (lane >= max_stream_) { + lane = 0; + } + } + } + } + // set all io_copy_once op in the first stream + for (auto& io_copy_once_ops : node->outlinks) { + if (io_copy_once_ops->AsStmt().op_type() == "io_copy_once") { + ops_in_streams_[0].push_back(io_copy_once_ops); + io_copy_once_ops->AsStmt().stream_id_ = 0; + io_copy_once_ops->AsStmt().need_sync_ = false; + ++io_copy_once_num_; + } + } + } +} + +bool MultiStreamAnalysisPass::CheckOpSupport() { + std::unordered_set invalid_op = { + "while", "conditional_block", "conditional_block_infer", "graph_op"}; + for (auto& op_type : op_types_set_) { + if (invalid_op.count(op_type)) { + LOG(INFO) << "multi_stream_analysis_pass don't support " << op_type + << ", just return."; + return false; + } + } + return true; +} + +bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) { + // feed op are prepared when init. + std::string op_name = stmt_node->AsStmt().op_type(); + if (op_name == "feed") { + return true; + } + + // Check is op's input are all accessible. + std::vector args; + for (auto* ins : stmt_node->inlinks) { + args.push_back(ins->AsArg().name); + } + return CheckAccess(args); +} + +bool MultiStreamAnalysisPass::CheckAccess( + const std::vector& args) { + if (args.size() == 0) { + return true; + } + for (auto& name : args) { + if (resources_[name]) { + continue; + } else { + return false; + } + } + return true; +} + +int MultiStreamAnalysisPass::SelectStreamId(const std::vector& lanes) { + if (lanes.size() == 0) { + return 0; + } + + int res = lanes[0]; + int exclude_io_copy_once_num = ops_in_streams_[0].size() - io_copy_once_num_; + int min_num = lanes[0] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[0]].size(); + for (size_t i = 1; i < lanes.size(); ++i) { + int ith_num = lanes[i] == 0 ? exclude_io_copy_once_num + : ops_in_streams_[lanes[i]].size(); + if (ith_num < min_num) { + res = lanes[i]; + min_num = ith_num; + } + } + + return res; +} + +void MultiStreamAnalysisPass::Launch(Node* stmt_node) { + // record ops launch order. + exec_que_.push(stmt_node); + std::vector lanes; + for (auto& in_arg : stmt_node->inlinks) { + // Weight parameter does not involve stream id, so just skip it. + if (in_arg->AsArg().is_weight || in_arg->AsArg().is_persist) { + continue; + } + + if (std::find(lanes.begin(), lanes.end(), in_arg->AsArg().lane) == + lanes.end()) { + lanes.push_back(in_arg->AsArg().lane); + } + } + + int stream_id = SelectStreamId(lanes); + + // If all inputs of the op are on multiple streams, they need to be + // synchronized + if (lanes.size() > 1) { + for (size_t i = 0; i < lanes.size(); ++i) { + if (lanes[i] != stream_id) { + stmt_node->AsStmt().sync_streams_.push_back(lanes[i]); + } + } + stmt_node->AsStmt().need_sync_ = true; + } + // io_copy are nodes inserted across devices and need to be synced. + if (stmt_node->AsStmt().op_type() == "io_copy") { + stmt_node->AsStmt().need_sync_ = true; + } + stmt_node->AsStmt().stream_id_ = stream_id; + + // set output lane and set the output of op to be accessible. 
+ for (auto& out_arg : stmt_node->outlinks) { + out_arg->AsArg().lane = stream_id; + resources_[out_arg->AsArg().name] = true; + } + ops_in_streams_[stream_id].push_back(stmt_node); +} + +void MultiStreamAnalysisPass::Apply(const std::unique_ptr& graph) { +#ifdef LITE_WITH_CUDA + typename Env::Devs& devs = + Env::Global(); + int dev_id = TargetWrapper::GetCurDevice(); + max_stream_ = devs[dev_id].max_stream(); +#else + LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON"; +#endif + + // Find the correct startup sequence for op. + Init(graph.get()); + bool is_valid = CheckOpSupport(); + if (!is_valid) { + return; + } + size_t prev_size; + + while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) { + prev_size = this->wait_que_.size() + this->wait_que_cpu_.size(); + // launch the acessible cuda kernel and remove it from wait que. + for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) { + if (IsPrepared(*it)) { + Launch(*it); + it = wait_que_.erase(it); + } else { + ++it; + } + } + // launch the accessible cpu kernel and remove it from wait que. + for (auto cpu_it = this->wait_que_cpu_.begin(); + cpu_it != this->wait_que_cpu_.end();) { + if (IsPrepared(*cpu_it)) { + Launch(*cpu_it); + cpu_it = wait_que_cpu_.erase(cpu_it); + } else { + ++cpu_it; + } + } + + if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) { + LOG(FATAL) << "network topo error!"; + } + } + + // Get exec ops order. + while (!exec_que_.empty()) { + auto* node = exec_que_.front(); + exec_ops_.push_back(node); + VLOG(4) << node->AsStmt().op_type() + << " stream: " << node->AsStmt().stream_id_ + << ", sync: " << node->AsStmt().need_sync_; + if (node->AsStmt().need_sync_) { + for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) { + VLOG(4) << " " << node->AsStmt().sync_streams_[i]; + } + } + exec_que_.pop(); + } + + // Set attribute parameters, for passing parameters between passes + const std::string attr_name{"nodes_in_order"}; + SetAttr>(attr_name, &exec_ops_); + + LOG(INFO) << "stream " << 0 << " has " + << ops_in_streams_[0].size() - io_copy_once_num_ + << " ops. (exclude io_copy_once)."; + for (size_t i = 1; i < ops_in_streams_.size(); ++i) { + LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size() + << " ops."; + } +} + +} // namespace mir +} // namespace lite +} // namespace paddle + +REGISTER_MIR_PASS(multi_stream_analysis_pass, + paddle::lite::mir::MultiStreamAnalysisPass) + .BindTargets({TARGET(kCUDA)}); diff --git a/lite/core/mir/multi_stream_analysis_pass.h b/lite/core/mir/multi_stream_analysis_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..37a7feca3a1200ad7ff26ef8fc0317deee9d174e --- /dev/null +++ b/lite/core/mir/multi_stream_analysis_pass.h @@ -0,0 +1,85 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <list>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "lite/core/kernel.h"
+#include "lite/core/mir/pass.h"
+
+namespace paddle {
+namespace lite {
+namespace mir {
+
+/*
+ * MultiStreamAnalysisPass will find the correct launch sequence for all ops.
+ * Ideally, the order should be mostly asynchronous ops and a small number of
+ * synchronous ops.
+ */
+class MultiStreamAnalysisPass : public StmtPass {
+ public:
+  void Apply(const std::unique_ptr<SSAGraph>& graph) override;
+
+ private:
+  // Init the resource list. Set all ops except feed to the inaccessible state
+  // and set the stream id according to the number of inputs.
+  void Init(SSAGraph* graph);
+
+  // Clean state information of all member variables.
+  void CleanUp();
+
+  // After launching, unlock the output resources of op.
+  void Launch(Node* stmt_node);
+
+  // If all inputs of an op are accessible, the op is considered to be in the
+  // prepared state.
+  bool IsPrepared(Node* stmt_node);
+
+  // Determine if all inputs of op are accessible.
+  bool CheckAccess(const std::vector<std::string>& args);
+
+  // The logic of selecting a stream:
+  // 1. Make the number of ops on each stream as close as possible.
+  // 2. The selected stream must be one of the streams contained in the input
+  //    args.
+  int SelectStreamId(const std::vector<int>& lanes);
+
+  // Check if the model's ops are all supported. If an unsupported op is
+  // encountered, exit.
+  bool CheckOpSupport();
+
+ private:
+  std::list<Node*> wait_que_;
+  std::list<Node*> wait_que_cpu_;
+  std::queue<Node*> exec_que_;
+  std::vector<Node*> exec_ops_;
+  std::vector<std::vector<Node*>> ops_in_streams_;
+  std::unordered_map<std::string, bool> resources_;
+  std::unordered_map<std::string, int> map_arg_to_lane_;
+  int max_stream_;
+  int io_copy_once_num_;
+  std::unordered_set<std::string> op_types_set_;
+};
+
+}  // namespace mir
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/mir/node.h b/lite/core/mir/node.h
index 45b15812fadb0789edea3f89fb00b4612bdb010f..ae7b112d9157de3f53c409dfc89bf1273531e05f 100644
--- a/lite/core/mir/node.h
+++ b/lite/core/mir/node.h
@@ -80,6 +80,12 @@ class Node {
 
     // Description.
     std::string desc;
+
+    // for cuda multi stream
+    bool need_sync_{false};
+    int stream_id_{0};
+    // streams which need to be synced; excludes stream_id_
+    std::vector<int> sync_streams_{};
   };
 
   struct Arg {
@@ -93,6 +99,7 @@ class Node {
     // if the need more than one tool operator(eg. io_copy layout calib), the
     // argument between them should be persist to make sure it's only run once
     bool is_persist{false};
+    int lane{-1};
   };
 
   Arg& AsArg(const std::string& name, int id);
diff --git a/lite/core/mir/pass.h b/lite/core/mir/pass.h
index 4e8c8be292bbd5e7f46664378634d4f1aeed2965..64f2db82c0b1b0b863c1aa61b3b2affea5f85d89 100644
--- a/lite/core/mir/pass.h
+++ b/lite/core/mir/pass.h
@@ -17,9 +17,11 @@
 #include <memory>
 #include <set>
 #include <string>
+#include <unordered_map>
 
 #include "lite/core/mir/node.h"
 #include "lite/core/mir/ssa_graph.h"
+#include "lite/utils/varient.h"
 
 namespace paddle {
 namespace lite {
@@ -121,6 +123,27 @@ class Pass {
 
   virtual ~Pass() = default;
 
+  bool HasAttr(const std::string& attr_name) const {
+    return pass_attrs_.count(attr_name) > 0;
+  }
+
+  // Set a pointer to the attribute. The specific pass itself takes ownership
+  // of the attribute.
+  template <typename AttrType>
+  void SetAttr(const std::string& attr_name, const AttrType* attr) {
+    VLOG(4) << "Setting the attribute " << attr_name << " for the pass "
+            << name_;
+    pass_attrs_[attr_name].set(*attr);
+  }
+
+  // Get a reference to the attribute previously set.
+ template + const AttrType& GetAttr(const std::string& attr_name) const { + CHECK(pass_attrs_.count(attr_name)) + << attr_name << " attr not register for pass " << name_; + return pass_attrs_.at(attr_name).get(); + } + private: const Kind kind_; std::string name_; @@ -128,6 +151,8 @@ class Pass { std::set bound_targets_; std::set excluded_targets_; std::unordered_map> bound_kernels_; + std::unordered_map>> + pass_attrs_; }; // Different kinds. diff --git a/lite/core/mir/pass_registry.h b/lite/core/mir/pass_registry.h index 849f80aea2191b72ac423c7125a4e69cb6927be5..170de1cd31ffd31662eb98898ad795993a36289e 100644 --- a/lite/core/mir/pass_registry.h +++ b/lite/core/mir/pass_registry.h @@ -59,6 +59,9 @@ class PassRegistry { } // namespace lite } // namespace paddle +// some platform-independent defintion +#include "lite/utils/macros.h" + #define REGISTER_MIR_PASS(name__, class__) \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ new class__); \ @@ -66,4 +69,4 @@ class PassRegistry { return mir_pass_registry##name__.Touch(); \ } \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ - __attribute__((unused)) = mir_pass_registry##name__ + UNUSED = mir_pass_registry##name__ diff --git a/lite/core/mir/quantized_op_attributes_inference_pass.cc b/lite/core/mir/quantized_op_attributes_inference_pass.cc index 40cad8f6af75300ab85753b16e391daeeadc6c2f..187e6b634fcf9d38cb32b7ca936ac8039c1717cf 100644 --- a/lite/core/mir/quantized_op_attributes_inference_pass.cc +++ b/lite/core/mir/quantized_op_attributes_inference_pass.cc @@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, paddle::lite::mir::QuantizedOpAttributesInferencePass) - .BindTargets({TARGET(kNPU)}); + .BindTargets({TARGET(kAPU), TARGET(kRKNPU)}); diff --git a/lite/core/mir/runtime_context_assign_pass.cc b/lite/core/mir/runtime_context_assign_pass.cc index 3cbe602f31a87c6ddb42d36fe75e52e8347695d8..5b6f968484b7b49838a004c3edfd00ff9b7e5e5e 100644 --- a/lite/core/mir/runtime_context_assign_pass.cc +++ b/lite/core/mir/runtime_context_assign_pass.cc @@ -45,9 +45,10 @@ class RuntimeContextAssignPass : public StmtPass { inst.picked_kernel().target())); } #else - inst.picked_kernel().SetContext( - ContextScheduler::Global().NewContext(inst.picked_kernel().target())); + int stream_id = inst.stream_id_; + inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext( + inst.picked_kernel().target(), stream_id)); #endif } } diff --git a/lite/core/mir/subgraph/subgraph_detector.cc b/lite/core/mir/subgraph/subgraph_detector.cc index b61f7f365f51a32e267dd12943be5fcfadb3e08a..6bab454c42a68a7513aa01ff06cc2be6c970e199 100644 --- a/lite/core/mir/subgraph/subgraph_detector.cc +++ b/lite/core/mir/subgraph/subgraph_detector.cc @@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { "turquoise4", "snow3", "sienna4", "salmon2", }; std::unordered_map subgraph_indices; - for (int i = 0; i < subgraphs_.size(); i++) { - for (int j = 0; j < subgraphs_[i].size(); j++) { + for (size_t i = 0; i < subgraphs_.size(); i++) { + for (size_t j = 0; j < subgraphs_[i].size(); j++) { subgraph_indices[subgraphs_[i][j]] = i; } } @@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, std::vector> subgraphs = SubgraphDetector(graph, teller)(); SubgraphVisualizer(graph, subgraphs)(); - for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { + for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size(); + 
subgraph_idx++) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); } diff --git a/lite/core/mir/subgraph/subgraph_detector_test.cc b/lite/core/mir/subgraph/subgraph_detector_test.cc index 1e54e1497b5d49754a705340aafa30ded1c2a727..f52c0332fa3cfce904d2b7c8bf010bc3d3ac6ac9 100644 --- a/lite/core/mir/subgraph/subgraph_detector_test.cc +++ b/lite/core/mir/subgraph/subgraph_detector_test.cc @@ -36,8 +36,8 @@ std::vector AddFCDesc( const std::shared_ptr& scope, const std::vector& input_var_names, const std::vector& wshape) { - CHECK_EQ(input_var_names.size(), 1); - CHECK_EQ(wshape.size(), 2); + CHECK_EQ(input_var_names.size(), 1u); + CHECK_EQ(wshape.size(), 2u); static int id = 0; std::string prefix = "fc_" + paddle::lite::to_string(id); auto* op_desc = block_desc->AddOp(); @@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { }; std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); - ASSERT_EQ(subgraphs.size(), 1); - ASSERT_EQ(graph->nodes().size(), 9); + ASSERT_EQ(subgraphs.size(), 1u); + ASSERT_EQ(graph->nodes().size(), 9u); mir::SubgraphVisualizer(graph.get(), subgraphs)(); } @@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { std::vector> subgraphs = mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphVisualizer(graph.get(), subgraphs)(); - ASSERT_EQ(subgraphs.size(), 1); + ASSERT_EQ(subgraphs.size(), 1u); } } // namespace lite diff --git a/lite/core/mir/subgraph/subgraph_pass.cc b/lite/core/mir/subgraph/subgraph_pass.cc index eecd9348ae684929d3f55dee2a94921a078f148c..663b69d38843555095957f30d652ba8ef6216a0e 100644 --- a/lite/core/mir/subgraph/subgraph_pass.cc +++ b/lite/core/mir/subgraph/subgraph_pass.cc @@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void APUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) \ + supported_lists.insert(#op_type); \ + LOG(INFO) << #op_type +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void XPUSubgraphPass::Apply(const std::unique_ptr& graph) { if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; std::unordered_set supported_lists; @@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr& graph) { fuser(); } +void RKNPUSubgraphPass::Apply(const std::unique_ptr& graph) { + std::unordered_set supported_lists; +#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); +#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h" +#undef USE_SUBGRAPH_BRIDGE + auto teller = [&](Node* node) { + if (!node->IsStmt()) return false; + auto& stmt = node->AsStmt(); + return supported_lists.count(stmt.op_type()) != 0; + }; + SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */); + fuser(); +} + void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { std::unordered_set supported_lists; #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); @@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr& graph) { REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) .BindTargets({TARGET(kNPU)}); +REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass) + 
.BindTargets({TARGET(kAPU)}); REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) .BindTargets({TARGET(kXPU)}); REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) .BindTargets({TARGET(kBM)}); +REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass) + .BindTargets({TARGET(kRKNPU)}); REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) .BindTargets({TARGET(kMLU)}); diff --git a/lite/core/mir/subgraph/subgraph_pass.h b/lite/core/mir/subgraph/subgraph_pass.h index f83448df42ffe6d6d8c5b37503b5127290037dce..8c2b501a62356c91e93f3c4ca91f70879d3c9229 100644 --- a/lite/core/mir/subgraph/subgraph_pass.h +++ b/lite/core/mir/subgraph/subgraph_pass.h @@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class APUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class XPUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; @@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass { void Apply(const std::unique_ptr& graph) override; }; +class RKNPUSubgraphPass : public ProgramPass { + public: + void Apply(const std::unique_ptr& graph) override; +}; + class MLUSubgraphPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; diff --git a/lite/core/mir/subgraph/subgraph_pass_test.cc b/lite/core/mir/subgraph/subgraph_pass_test.cc index a2369adc5d882310503cbf52fa5394098d824b40..c638793c08160eb8ee7edabeab0977541e85d82a 100644 --- a/lite/core/mir/subgraph/subgraph_pass_test.cc +++ b/lite/core/mir/subgraph/subgraph_pass_test.cc @@ -39,7 +39,7 @@ std::vector> ShapeParsing(std::string text) { std::vector> shapes; std::vector shape_strings = Split(text, ":"); shapes.resize(shape_strings.size()); - for (int i = 0; i < shape_strings.size(); i++) { + for (size_t i = 0; i < shape_strings.size(); i++) { std::vector shape_nums = Split(shape_strings[i], ","); for (auto shape_num : shape_nums) { shapes[i].push_back(atoi(shape_num.c_str())); @@ -66,7 +66,7 @@ void FillInputTensors( for (int j = 0; j < input_tensor_size; j++) { \ input_tensor_data[j] = static_cast(value); \ } - for (int i = 0; i < input_tensor_shape.size(); i++) { + for (size_t i = 0; i < input_tensor_shape.size(); i++) { auto input_tensor = predictor->GetInput(i); input_tensor->Resize(input_tensor_shape[i]); auto input_tensor_size = ShapeProduction(input_tensor->shape()); @@ -95,7 +95,7 @@ void CheckOutputTensors( << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ EXPECT_LT(rel_diff, 0.1); \ } - for (int i = 0; i < output_tensor_type.size(); i++) { + for (size_t i = 0; i < output_tensor_type.size(); i++) { auto tar_output_tensor = tar_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); diff --git a/lite/core/mir/type_precision_cast_pass.cc b/lite/core/mir/type_precision_cast_pass.cc index ecccf89fa76287a3f30756f7138fcce229e8f337..121e64dc188eeb638becec3506b514bc24dad16d 100644 --- a/lite/core/mir/type_precision_cast_pass.cc +++ b/lite/core/mir/type_precision_cast_pass.cc @@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, auto input_or_output_scales = op_info->GetAttr>(attr_name); auto size = input_or_output_names.size(); CHECK(size == input_or_output_scales.size()); - for (int i = 0; i < size; i++) { + for (size_t i = 0; i < size; i++) { if 
(input_or_output_names[i] == var_name) { *scale = input_or_output_scales[i]; return true; @@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr& graph) { nodes.push_back(node); } + // record the copied node. + std::unordered_map cast_nodes; + for (auto& node : nodes) { if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; auto inlinks = node->inlinks; for (auto* in : inlinks) { - ComplementInputs(graph.get(), node, in); + ComplementInputs(graph.get(), node, in, &cast_nodes); } } } -void PrecisionCastPass::ComplementInputs(SSAGraph* graph, - Node* inst_node, - Node* in) { +void PrecisionCastPass::ComplementInputs( + SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes) { // If this input is out of date. if (inst_node->inlinks.end() == std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) @@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, in, graph, inst_node, + cast_nodes, graph->valid_places()); } } -void PrecisionCastPass::AddCastInst(const Type& from, - const Type& to, - Node* in, - SSAGraph* graph, - Node* inst_node, - const std::vector& valid_places) { +void PrecisionCastPass::AddCastInst( + const Type& from, + const Type& to, + Node* in, + SSAGraph* graph, + Node* inst_node, + std::unordered_map* cast_nodes, + const std::vector& valid_places) { CHECK(!valid_places.empty()) << "valid_place should be set"; // var -> new_transform_op -> new_var -> inst @@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, auto cast_op_output_name = in->AsArg().name + "/precision_trans"; // in->AsArg().name + "/precision_trans/" + // paddle::lite::to_string(node_id()); - auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); - cast_op_output_arg->AsArg().type = - LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); - auto* cast_inst = graph->NewInstructNode(); + if (cast_nodes->count(in->AsArg().name)) { + // Remove the old link + RemoveDirectedLink(in, inst_node); + // Update the original instruction OpDesc. + // Update its input to the cast_op_output_name + // Add new link, newarg->inst + DirectedLink(cast_nodes->at(in->AsArg().name), + inst_node); // [io_copy kernel]'s output -> [current kernel] + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } else { + auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); + cast_op_output_arg->AsArg().type = + LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); + auto* cast_inst = graph->NewInstructNode(); - // create Op and kernels. - bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; - std::string cast_type = in_persist ? "calib_once" : "calib"; - cast_op_output_arg->AsArg().is_persist = in_persist; - auto cast_op = LiteOpRegistry::Global().Create(cast_type); - CHECK(cast_op) << "create op [" << cast_op << "] failed"; + // create Op and kernels. + bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; + std::string cast_type = in_persist ? "calib_once" : "calib"; + cast_op_output_arg->AsArg().is_persist = in_persist; + auto cast_op = LiteOpRegistry::Global().Create(cast_type); + CHECK(cast_op) << "create op [" << cast_op << "] failed"; - // Create the new var manually. - inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); + // Create the new var manually. 
+ inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); - // Create Calib Instruction. - cpp::OpDesc op_desc; - op_desc.SetType(cast_type); - op_desc.SetInput("Input", {in->AsArg().name}); - op_desc.SetOutput("Out", {cast_op_output_name}); - float scale; - if (InferScale(in, inst_node, &scale)) { - op_desc.SetAttr("scale", scale); - } + // Create Calib Instruction. + cpp::OpDesc op_desc; + op_desc.SetType(cast_type); + op_desc.SetInput("Input", {in->AsArg().name}); + op_desc.SetOutput("Out", {cast_op_output_name}); + float scale; + if (InferScale(in, inst_node, &scale)) { + op_desc.SetAttr("scale", scale); + } - cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); - auto kernels = cast_op->CreateKernels(valid_places); - std::vector> selected_kernels; - bool is_found = false; - for (auto& kernel : kernels) { - const Type* in_arg_ty = kernel->GetInputDeclType("Input"); - const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); - if (TypeCompatible(*in_arg_ty, from) && - out_arg_ty->precision() == to.precision()) { - is_found = true; - selected_kernels.emplace_back(std::move(kernel)); - // we pick the kernel - cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); - break; + cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); + auto kernels = cast_op->CreateKernels(valid_places); + std::vector> selected_kernels; + bool is_found = false; + for (auto& kernel : kernels) { + const Type* in_arg_ty = kernel->GetInputDeclType("Input"); + const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); + if (TypeCompatible(*in_arg_ty, from) && + out_arg_ty->precision() == to.precision()) { + is_found = true; + selected_kernels.emplace_back(std::move(kernel)); + // we pick the kernel + cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); + (*cast_nodes)[in->AsArg().name] = cast_op_output_arg; + break; + } } - } - CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" - << in->AsArg().name << "->" << to << ":" - << inst_node->AsStmt().op_info()->Type(); + CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" + << in->AsArg().name << "->" << to << ":" + << inst_node->AsStmt().op_info()->Type(); - // Remove the old link - RemoveDirectedLink(in, inst_node); + // Remove the old link + RemoveDirectedLink(in, inst_node); - // Update the original instruction OpDesc. - // Update its input to the io_copy_output_name + // Update the original instruction OpDesc. 
+ // Update its input to the io_copy_output_name - // Add new link, var -> new_inst, new_inst->newarg, newarg->inst - DirectedLink(in, cast_inst); - DirectedLink(cast_inst, cast_op_output_arg); - DirectedLink(cast_op_output_arg, inst_node); + // Add new link, var -> new_inst, new_inst->newarg, newarg->inst + DirectedLink(in, cast_inst); + DirectedLink(cast_inst, cast_op_output_arg); + DirectedLink(cast_op_output_arg, inst_node); - // reset opdesc and update kernel information - UpdateInputs( - inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + // reset opdesc and update kernel information + UpdateInputs( + inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); + } // recreate the op auto original_selected_kernel = diff --git a/lite/core/mir/type_precision_cast_pass.h b/lite/core/mir/type_precision_cast_pass.h index b5f7c5d902a998e369f0b1775c59f50cbf8dc256..d8d6af5fcd06c187029c7c16a74efade0d4bd5ca 100644 --- a/lite/core/mir/type_precision_cast_pass.h +++ b/lite/core/mir/type_precision_cast_pass.h @@ -16,6 +16,7 @@ #include #include +#include #include #include "lite/core/mir/pass.h" #include "lite/core/op_registry.h" @@ -34,13 +35,17 @@ class PrecisionCastPass : public ProgramPass { public: void Apply(const std::unique_ptr& graph) override; - void ComplementInputs(SSAGraph* graph, Node* inst_node, Node* in); + void ComplementInputs(SSAGraph* graph, + Node* inst_node, + Node* in, + std::unordered_map* cast_nodes); void AddCastInst(const Type& from, const Type& to, Node* in, SSAGraph* graph, Node* inst_node, + std::unordered_map* cast_nodes, const std::vector& valid_places); void SetValidPlaces(const std::vector& valid_places); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.cc b/lite/core/mir/weight_quantization_preprocess_pass.cc index c7889a54903f2a1d194fb3eade0bd92670b36699..2bb247871b9500129eeea855677a907cb4fd88b9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.cc +++ b/lite/core/mir/weight_quantization_preprocess_pass.cc @@ -22,9 +22,29 @@ namespace paddle { namespace lite { namespace mir { +bool IsAbsMaxQuantizedOp(const OpInfo& op_info) { + bool result = false; + if (op_info.HasAttr("quantization_type") && + op_info.GetAttr("quantization_type") == + "post_weight_abs_max") { + result = true; + } else if (!op_info.HasAttr("quantization_type") && + op_info.HasAttr("quantize_weight_bits")) { // Support older model, + // save this for now + result = true; + } + return result; +} + +/* + * For abs_max method in WeightQuantization, this pass obtains the scale value + * of conv2d, depthwise_conv2d and mul, expands the scale list, and save the + * list in the quantized ops. 
+*/ void WeightQuantizationPreprocessPass::Apply( const std::unique_ptr& graph) { - std::vector weight_quantized_op = {"conv2d", "depthwise_conv2d"}; + std::vector weight_quantized_op = { + "conv2d", "depthwise_conv2d", "mul"}; for (auto& node : graph->StmtTopologicalOrder()) { if (node->IsStmt() && std::find(weight_quantized_op.begin(), @@ -32,14 +52,20 @@ void WeightQuantizationPreprocessPass::Apply( node->AsStmt().op_type()) != weight_quantized_op.end()) { auto* scope = node->stmt()->op()->scope(); auto* op_desc = node->stmt()->mutable_op_info(); - if (op_desc->HasAttr("quantize_weight_bits")) { + if (IsAbsMaxQuantizedOp(*op_desc)) { for (auto& input_name : op_desc->input_vars()) { std::string scale_name = input_name + "_quant_scale"; if (op_desc->HasAttr(scale_name)) { - VLOG(5) << "op:" << op_desc->Type() << " input_name:" << input_name; + VLOG(0) << " WeightQuantizationPreprocessPass op:" + << op_desc->Type() << " input_name:" << input_name; auto input_tensor = scope->FindVar(input_name)->GetMutable(); - int weight_out_channel = static_cast(input_tensor->dims()[0]); + int weight_out_channel; + if (op_desc->Type() == "mul") { + weight_out_channel = static_cast(input_tensor->dims()[1]); + } else { + weight_out_channel = static_cast(input_tensor->dims()[0]); + } auto input_scale = op_desc->GetAttr>(scale_name); // scale length is equal to weight out channel std::vector scale_list(weight_out_channel, input_scale[0]); diff --git a/lite/core/mir/weight_quantization_preprocess_pass.h b/lite/core/mir/weight_quantization_preprocess_pass.h index 76a35c6b443c692ec08688abd4c10680be62b8af..e7c9f03eef78bdafea204d30c78cf0d044bb15e9 100644 --- a/lite/core/mir/weight_quantization_preprocess_pass.h +++ b/lite/core/mir/weight_quantization_preprocess_pass.h @@ -25,8 +25,9 @@ namespace mir { * If the model is quantized by WeightQuantization in PostTrainingQuantization, * the data type of the weight in quantized ops (conv2d, depthwise_conv2d) is * int, and the scale is save in the quantized ops. - * WeightQuantizationPreprocessPass obtains the scale value, expands the - * scale value to a list, and save the list in the quantized ops. + * For abs_max method in WeightQuantization, WeightQuantizationPreprocessPass + * obtains the scale value of conv2d, depthwise_conv2d and mul, expands the + * scale list, and save the list in the quantized ops. */ class WeightQuantizationPreprocessPass : public ProgramPass { public: diff --git a/lite/core/op_lite.cc b/lite/core/op_lite.cc index f8a706179374a0c86e28cf9a3638f5df2c932540..941a9e9f88cf04ef47487237b1a3f6509dea762b 100644 --- a/lite/core/op_lite.cc +++ b/lite/core/op_lite.cc @@ -41,7 +41,7 @@ bool OpLite::InferShapeWithCache() { iter++) { // combined dims value into new_hash value. auto &element_dims = (*iter)->dims(); - for (int i = 0; i < element_dims.size(); i++) { + for (size_t i = 0; i < element_dims.size(); i++) { new_hash = lite::hash_combine(new_hash, static_cast(element_dims[i])); } @@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() { auto &emement_lods = (*iter)->lod(); for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); lod_iter++) { - for (int i = 0; i < lod_iter->size(); i++) { + for (size_t i = 0; i < lod_iter->size(); i++) { new_hash = lite::hash_combine(new_hash, static_cast(lod_iter->at(i))); } @@ -60,7 +60,7 @@ bool OpLite::InferShapeWithCache() { // if current hash value is consistent with io_shape_lod_hash_, // previous outputs shape and lod are reused. 
auto *current_outputs = param_.output_tensor_ptrs(); - for (int i = 0; i < current_outputs->size(); i++) { + for (size_t i = 0; i < current_outputs->size(); i++) { current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->set_lod(last_output_lods[i]); } @@ -69,7 +69,7 @@ bool OpLite::InferShapeWithCache() { io_shape_lod_hash_ = new_hash; this->InferShapeImpl(); auto *current_outputs = param_.output_tensor_ptrs(); - for (int i = 0; i < current_outputs->size(); i++) { + for (size_t i = 0; i < current_outputs->size(); i++) { last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_lods[i] = current_outputs->at(i)->lod(); } diff --git a/lite/core/op_registry.cc b/lite/core/op_registry.cc index 84f54b57b86c012ac72e367d657263b156e6c301..29c853c70caa80add9d47182da228a36f031cb42 100644 --- a/lite/core/op_registry.cc +++ b/lite/core/op_registry.cc @@ -98,6 +98,9 @@ std::list> KernelRegistry::Create( case TARGET(kNPU): { CREATE_KERNEL(kNPU); } break; + case TARGET(kAPU): { + CREATE_KERNEL(kAPU); + } break; case TARGET(kXPU): { CREATE_KERNEL(kXPU); } break; @@ -110,6 +113,9 @@ std::list> KernelRegistry::Create( case TARGET(kMLU): { CREATE_KERNEL(kMLU); } break; + case TARGET(kRKNPU): { + CREATE_KERNEL(kRKNPU); + } break; default: CHECK(false) << "not supported kernel target " << TargetToStr(target); } @@ -151,16 +157,30 @@ KernelRegistry::KernelRegistry() INIT_FOR(kMLU, kInt16, kNHWC); INIT_FOR(kMLU, kInt16, kNCHW); - INIT_FOR(kHost, kFloat, kNCHW); - INIT_FOR(kHost, kInt32, kNCHW); - INIT_FOR(kHost, kInt64, kNCHW); INIT_FOR(kHost, kAny, kNCHW); - INIT_FOR(kHost, kFloat, kNHWC); - INIT_FOR(kHost, kFloat, kAny); - INIT_FOR(kHost, kAny, kNHWC); - INIT_FOR(kHost, kAny, kAny); INIT_FOR(kHost, kAny, kNHWC); INIT_FOR(kHost, kAny, kAny); + INIT_FOR(kHost, kBool, kNCHW); + INIT_FOR(kHost, kBool, kNHWC); + INIT_FOR(kHost, kBool, kAny); + INIT_FOR(kHost, kFloat, kNCHW); + INIT_FOR(kHost, kFloat, kNHWC); + INIT_FOR(kHost, kFloat, kAny); + INIT_FOR(kHost, kFP16, kNCHW); + INIT_FOR(kHost, kFP16, kNHWC); + INIT_FOR(kHost, kFP16, kAny); + INIT_FOR(kHost, kInt8, kNCHW); + INIT_FOR(kHost, kInt8, kNHWC); + INIT_FOR(kHost, kInt8, kAny); + INIT_FOR(kHost, kInt16, kNCHW); + INIT_FOR(kHost, kInt16, kNHWC); + INIT_FOR(kHost, kInt16, kAny); + INIT_FOR(kHost, kInt32, kNCHW); + INIT_FOR(kHost, kInt32, kNHWC); + INIT_FOR(kHost, kInt32, kAny); + INIT_FOR(kHost, kInt64, kNCHW); + INIT_FOR(kHost, kInt64, kNHWC); + INIT_FOR(kHost, kInt64, kAny); INIT_FOR(kX86, kFloat, kNCHW); INIT_FOR(kX86, kAny, kNCHW); @@ -203,6 +223,7 @@ KernelRegistry::KernelRegistry() INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kAny); + INIT_FOR(kAPU, kInt8, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kAny, kNCHW); @@ -218,6 +239,11 @@ KernelRegistry::KernelRegistry() INIT_FOR(kBM, kInt8, kNCHW); INIT_FOR(kBM, kAny, kNCHW); INIT_FOR(kBM, kAny, kAny); + + INIT_FOR(kRKNPU, kFloat, kNCHW); + INIT_FOR(kRKNPU, kInt8, kNCHW); + INIT_FOR(kRKNPU, kAny, kNCHW); + INIT_FOR(kRKNPU, kAny, kAny); #undef INIT_FOR } diff --git a/lite/core/op_registry.h b/lite/core/op_registry.h index 96c9fc2358199594cf9590385c2efdaf1c671425..7d73155ac067da4bfd112661d9061c008c1ccef1 100644 --- a/lite/core/op_registry.h +++ b/lite/core/op_registry.h @@ -231,6 +231,9 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + KernelRegistryForTarget *, // KernelRegistryForTarget *, // @@ -251,6 +254,16 @@ class KernelRegistry final { PRECISION(kInt8), DATALAYOUT(kNCHW)> *, // + 
KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // + KernelRegistryForTarget *, // @@ -435,32 +448,31 @@ class KernelRegistor : public lite::Registor { #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) -#define REGISTER_LITE_KERNEL( \ - op_type__, target__, precision__, layout__, KernelClass, alias__) \ - static paddle::lite::KernelRegistor \ - LITE_KERNEL_REGISTER_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__)(#op_type__, \ - #alias__); \ - static KernelClass LITE_KERNEL_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__); \ - int touch_##op_type__##target__##precision__##layout__##alias__() { \ - OpKernelInfoCollector::Global().AddKernel2path( \ - #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ - __FILE__); \ - LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ - .Touch(); \ - return 0; \ - } \ - static bool LITE_KERNEL_PARAM_INSTANCE( \ - op_type__, target__, precision__, layout__, alias__) \ - __attribute__((unused)) = \ - paddle::lite::ParamTypeRegistry::NewInstance( \ - #op_type__ "/" #alias__) +#define REGISTER_LITE_KERNEL( \ + op_type__, target__, precision__, layout__, KernelClass, alias__) \ + static paddle::lite::KernelRegistor \ + LITE_KERNEL_REGISTER_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__)(#op_type__, \ + #alias__); \ + static KernelClass LITE_KERNEL_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__); \ + int touch_##op_type__##target__##precision__##layout__##alias__() { \ + OpKernelInfoCollector::Global().AddKernel2path( \ + #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ + __FILE__); \ + LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ + .Touch(); \ + return 0; \ + } \ + static bool LITE_KERNEL_PARAM_INSTANCE( \ + op_type__, target__, precision__, layout__, alias__) UNUSED = \ + paddle::lite::ParamTypeRegistry::NewInstance( \ + #op_type__ "/" #alias__) #define LITE_KERNEL_INSTANCE( \ op_type__, target__, precision__, layout__, alias__) \ diff --git a/lite/core/optimizer.h b/lite/core/optimizer.h index 80c2bd553f6b8073e55d28ef0115246266a6a1c9..83df76f0230f666ec3857834e234afd921daa927 100644 --- a/lite/core/optimizer.h +++ b/lite/core/optimizer.h @@ -86,6 +86,8 @@ class Optimizer { "npu_subgraph_pass", "xpu_subgraph_pass", "bm_subgraph_pass", + "apu_subgraph_pass", + "rknpu_subgraph_pass", "static_kernel_pick_pass", // pick original kernel from graph "variable_place_inference_pass", // inference arg/var's // info(target/precision/layout/device) @@ -127,7 +129,21 @@ class Optimizer { "memory_optimize_pass"}}; if (passes.size() == 1) { - passes_local.push_back(passes[0]); + // multi_stream_analysis_pass must be in the front of + // runtime_context_assign_pass + const std::string msa_pass{"multi_stream_analysis_pass"}; + const std::string depend_pass{"runtime_context_assign_pass"}; + if (passes[0] == msa_pass) { + auto iter = + std::find(passes_local.begin(), passes_local.end(), depend_pass); + if (iter != passes_local.end()) { + passes_local.insert(iter, msa_pass); + } else { + CHECK(false) << "Not find " << depend_pass; + } + } else { + passes_local.push_back(passes[0]); + } } RunPasses(passes_local); } else { diff --git a/lite/core/profile/precision_profiler.h b/lite/core/profile/precision_profiler.h index 
e72d1f54ee858ef10de83ceefb49addae6ea6606..ee581bf5e126f07fcdb1edeb9ab5b570df0c2ade 100644 --- a/lite/core/profile/precision_profiler.h +++ b/lite/core/profile/precision_profiler.h @@ -178,6 +178,13 @@ class PrecisionProfiler { write_result_to_file&& write_tensorfile(in, name); return; } + case PRECISION(kInt64): { + auto ptr = in->data(); + *mean = compute_mean(ptr, in->numel()); + *std_dev = compute_standard_deviation( + ptr, in->numel(), true, *mean); + return; + } default: *mean = -333333333333; *std_dev = -33333333333; diff --git a/lite/core/program.cc b/lite/core/program.cc index ff900c0e23be9a06313babba51e3ce364295231a..5ddf6c0e935a851cc0b3c3eb7554609939ef1cbf 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { std::unordered_map origin_var_maps; auto& main_block = *desc->GetBlock(0); auto var_size = main_block.VarsSize(); - for (int i = 0; i < var_size; i++) { + for (size_t i = 0; i < var_size; i++) { auto v = main_block.GetVar(i); auto name = v->Name(); origin_var_maps.emplace(name, *v); @@ -145,6 +145,11 @@ void RuntimeProgram::Run() { for (auto& inst : instructions_) { #ifndef LITE_WITH_FPGA if (inst.is_feed_fetch_op()) continue; +#endif +#ifdef LITE_WITH_CUDA + if (inst.need_sync()) { + inst.Sync(); + } #endif inst.Run(); #ifdef LITE_WITH_PRECISION_PROFILE diff --git a/lite/core/program.h b/lite/core/program.h index c845a17c52c0c565e339a13e093f3e8f59e8d4a7..9d5fef7c0367d0e0fabf6ecff8b22e5e20a7bb57 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -108,6 +108,18 @@ struct Instruction { bool is_feed_fetch_op() const { return is_feed_fetch_op_; } +#ifdef LITE_WITH_CUDA + bool need_sync() const { + if (kernel_->target() == TargetType::kCUDA) { + return kernel_->mutable_context()->As().need_sync(); + } else { + // the io_copy kernel has synced, so cpu kernels don't need to sync.
+ return false; + } + } + void Sync() const { kernel_->mutable_context()->As().Sync(); } +#endif + #ifdef LITE_WITH_PROFILE void set_profiler(profile::Profiler* profiler) { profiler_ = profiler; diff --git a/lite/core/tensor.cc b/lite/core/tensor.cc index ecb9935dfd13c09cbd1a20f3833e6ab76161192a..1ae291dd40d19940e93bfda9b0c22f4092ce7988 100644 --- a/lite/core/tensor.cc +++ b/lite/core/tensor.cc @@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { void TensorLite::ResetBuffer(std::shared_ptr buffer, size_t memory_size) { - CHECK_EQ(offset_, 0) + CHECK_EQ(offset_, 0u) << "Only the offset is supported to zero when the Buffer is reset."; if (buffer_) { CHECK_LE(memory_size_, buffer->space()) diff --git a/lite/core/types.cc b/lite/core/types.cc index 4ea383333d519ac2c481dce459ca49124a64df32..a19c5ed0a33986237ce03213875929d34a2fb363 100644 --- a/lite/core/types.cc +++ b/lite/core/types.cc @@ -67,31 +67,31 @@ STL::ostream& operator<<(STL::ostream& os, const KernelPickFactor& k) { template <> Type StdTypeToRepr() { - return Type::_int32; + return Type::INT32; } template <> Type StdTypeToRepr() { - return Type::_int64; + return Type::INT64; } template <> Type StdTypeToRepr() { - return Type::_float32; + return Type::FLOAT32; } template <> Type StdTypeToRepr() { - return Type::_float64; + return Type::Float64; } template <> Type StdTypeToRepr>() { - return Type::_char_list; + return Type::CHARLIST; } template <> Type StdTypeToRepr() { - return Type::_string; + return Type::STRING; } template <> Type StdTypeToRepr() { - return Type::_bool; + return Type::BOOL; } } // namespace core diff --git a/lite/core/types.h b/lite/core/types.h index 8f154f9dd509d3627750ecbf301923a2296252d1..66dc44746a7496d9805e8cc2b6bf2df89b33ddbf 100644 --- a/lite/core/types.h +++ b/lite/core/types.h @@ -29,23 +29,23 @@ namespace core { */ // TODO(Superjomn) unify all the type representation across the lite framework. 
enum class Type { - _unk = -1, - // primary types - _int32, - _int64, - _float32, - _float64, - _bool, - _string, + UNK = -1, + // primary types + INT32, + INT64, + FLOAT32, + Float64, + BOOL, + STRING, // primary list type - _char_list, + CHARLIST, // list types - _list, + LIST, // enum type - _enum, - _float16, + ENUM, + FLOAT16, // number of types - __num__, + NUM, }; enum class FluidType { @@ -81,7 +81,7 @@ enum class FluidType { template Type StdTypeToRepr() { - return Type::_unk; + return Type::UNK; } template <> Type StdTypeToRepr(); @@ -92,6 +92,8 @@ Type StdTypeToRepr(); template <> Type StdTypeToRepr(); template <> +Type StdTypeToRepr(); +template <> Type StdTypeToRepr>(); template <> Type StdTypeToRepr(); diff --git a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc index 0c9da1a76422edae45dfeec5d38556a5e2322a85..2a819883fa316bd1898c063912800b57804218db 100644 --- a/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc +++ b/lite/demo/cxx/mobile_full/mobilenetv1_full_api.cc @@ -18,6 +18,11 @@ #include "paddle_api.h" // NOLINT #include "paddle_use_passes.h" // NOLINT +#if defined(_WIN32) +#include "paddle_use_kernels.h" // NOLINT +#include "paddle_use_ops.h" // NOLINT +#endif + using namespace paddle::lite_api; // NOLINT DEFINE_string(model_dir, "", "Model dir path."); diff --git a/lite/demo/python/mobilenetv1_full_api.py b/lite/demo/python/mobilenetv1_full_api.py index a31469e3e8da81f3753dc5d241d4ef39ac03832f..c3a6bd077be5978f1ecaf9b040b119e50117d62b 100644 --- a/lite/demo/python/mobilenetv1_full_api.py +++ b/lite/demo/python/mobilenetv1_full_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/demo/python/mobilenetv1_light_api.py b/lite/demo/python/mobilenetv1_light_api.py index a44427092bae88aa41b3b1d0684cfcf36835b3d2..5847c7819366b654dd9d5b5cbe2108b54da7b04c 100644 --- a/lite/demo/python/mobilenetv1_light_api.py +++ b/lite/demo/python/mobilenetv1_light_api.py @@ -23,7 +23,7 @@ import argparse import sys sys.path.append('../../python/lib') -from lite_core import * +from paddlelite.lite import * # Command arguments parser = argparse.ArgumentParser() diff --git a/lite/fluid/data_type.cc b/lite/fluid/data_type.cc index d33a77c4bfcefbc349d453de05dcbb7c27707a19..9c96459993e55b441ea795c4f2cb58f40846c0d9 100644 --- a/lite/fluid/data_type.cc +++ b/lite/fluid/data_type.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License.
+#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h #include "lite/fluid/data_type.h" #include #include diff --git a/lite/gen_code/CMakeLists.txt b/lite/gen_code/CMakeLists.txt index 40c95415546d99a66abf2d6f3595ae8695c4df86..2416278ad74068d28f6de523c55513891b08cc72 100644 --- a/lite/gen_code/CMakeLists.txt +++ b/lite/gen_code/CMakeLists.txt @@ -15,6 +15,7 @@ lite_cc_test(test_gen_code SRCS gen_code_test.cc X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} @@ -43,6 +44,7 @@ lite_cc_test(test_generated_code SRCS generated_code_test.cc DEPS __generated_co X86_DEPS ${x86_kernels} ARM_DEPS ${arm_kernels} NPU_DEPS ${npu_kernels} + RKNPU_DEPS ${rknpu_kernels} XPU_DEPS ${xpu_kernels} CL_DEPS ${opencl_kernels} FPGA_DEPS ${fpga_kernels} diff --git a/lite/kernels/CMakeLists.txt b/lite/kernels/CMakeLists.txt index 78bb8d10b798b73861ddbf25e427289fc2984a55..17a836b17183d69b0e2a15b46b7a2097c323312f 100644 --- a/lite/kernels/CMakeLists.txt +++ b/lite/kernels/CMakeLists.txt @@ -11,4 +11,6 @@ add_subdirectory(fpga) add_subdirectory(npu) add_subdirectory(xpu) add_subdirectory(mlu) +add_subdirectory(apu) add_subdirectory(bm) +add_subdirectory(rknpu) diff --git a/lite/kernels/apu/CMakeLists.txt b/lite/kernels/apu/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..25182e2e20f9204e4dfd62b72c650ac0b07f3318 --- /dev/null +++ b/lite/kernels/apu/CMakeLists.txt @@ -0,0 +1,3 @@ +add_subdirectory(bridges) + +add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges}) diff --git a/lite/kernels/apu/bridges/CMakeLists.txt b/lite/kernels/apu/bridges/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ac4670f04e0fc7711a898476c1f9bd0c016127c --- /dev/null +++ b/lite/kernels/apu/bridges/CMakeLists.txt @@ -0,0 +1,30 @@ +if(NOT LITE_WITH_APU) + return() +endif() + + +lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor) +lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu) + +set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu) + +lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps}) +lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps}) + + +set(apu_subgraph_bridges + subgraph_bridge_registry + subgraph_bridge_utility_apu + subgraph_bridge_conv_op_apu + subgraph_bridge_elementwise_ops_apu + subgraph_bridge_act_op_apu + subgraph_bridge_softmax_op_apu + subgraph_bridge_fc_op_apu + subgraph_bridge_pool_op_apu + CACHE INTERNAL "apu_subgraph_bridges") + +message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}") diff --git a/lite/kernels/apu/bridges/act_op.cc b/lite/kernels/apu/bridges/act_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c2451d640eb52f6da88c4cd91bbf4ccd95f49152 --- /dev/null +++ 
b/lite/kernels/apu/bridges/act_op.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + + return SUCCESS; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter); diff --git a/lite/kernels/apu/bridges/conv_op.cc b/lite/kernels/apu/bridges/conv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..859ad777ae58c3be0f36290adb47356f90c795ce --- /dev/null +++ b/lite/kernels/apu/bridges/conv_op.cc @@ -0,0 +1,565 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/operators/conv_op.h" +#include +#include +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ConvConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + int neuron_errCode; + + VLOG(3) << "[APU] Converting [" << op_type << "]"; + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + LOAD_FUNCTIONS(libHandle, + NeuronModel_setOperandSymmPerChannelQuantParams, + neuron_model_setOperandSymmPerChannelQuantParams) + + // Get input and output vars and op attributes + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + + auto filter_name = op_info->Input("Filter").front(); + auto filter = scope->FindMutableTensor(filter_name); + auto filter_dims = filter->dims(); + + auto output_name = op_info->Output("Output").front(); + auto output = scope->FindMutableTensor(output_name); + auto output_dims = output->dims(); + + auto bs = input_dims[0]; + auto ic = input_dims[1]; + auto oc = filter_dims[0]; + CHECK_EQ(input_dims.size(), 4L); + CHECK_EQ(output_dims.size(), 4L); + CHECK_EQ(filter_dims.size(), 4L); + CHECK_EQ(output_dims[0], bs); + CHECK_EQ(output_dims[1], oc); + auto strides = op_info->GetAttr>("strides"); + auto paddings = op_info->GetAttr>("paddings"); + auto groups = op_info->GetAttr("groups"); + auto dilations = op_info->GetAttr>("dilations"); + bool with_act = + op_info->HasAttr("with_act") && op_info->GetAttr("with_act"); + std::string act_type = + with_act ? op_info->GetAttr("act_type") : ""; + float leaky_relu_alpha = act_type == "leaky_relu" + ? op_info->GetAttr("leaky_relu_alpha") + : 0.f; + CHECK_EQ(strides.size(), 2L); + CHECK_EQ(dilations.size(), 2L); + bool is_depthwise_mode = ic == groups && oc == groups; + VLOG(3) << "is_depthwise_mode" << is_depthwise_mode; + + if (paddings.size() == 2L) { + for (size_t i = 0; i < strides.size(); ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the input size." 
+ << paddings.size(); + + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + operators::UpdatePaddingAndDilation(&paddings, + &dilations, + strides, + padding_algorithm, + input_dims, + filter_dims); + + float input_scale; + float output_scale; + std::vector weight_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + weight_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + output_scale = op_info->GetAttr("output_scale"); + VLOG(3) << "has output scale:" << output_scale; + } else { + return FAILED; + } + } else { + return FAILED; + } + + VLOG(3) << "strides.size(): " << strides.size() << " ,groups: " << groups + << " ,dilations: " << dilations[0] << ":" << dilations[1]; + VLOG(3) << "with_act: " << with_act << " ,act_type:" << act_type; + VLOG(3) << "input_dims: " << input_dims << " ,output_dims: " << output_dims + << " ,weight_scale size: " << weight_scale.size(); + VLOG(3) << "filter_dims: " << filter_dims + << " ,memory_size: " << filter->memory_size() + << " ,data_size: " << filter->data_size(); + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + inType.dimensions = &dims_in[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Graph has " << input_name; + // input operand already exist + input_node = graph->Get(input_name); + } else { + // add input operand + if (graph->IsInput(input_name)) { + // Insert transpose for NCHW -> NHWC + insert_transpose_node( + ctx, + input_name, + "transpose_" + input_name, + {input_dims[0], input_dims[1], input_dims[2], input_dims[3]}, + dims_in, + {0, 2, 3, 1}, + inType.scale, + inType.zeroPoint); + + // change input_name + input_name = "transpose_" + input_name; + input_node = graph->Get(input_name); + if (input_node == nullptr) return subgraph::FAILED; + } else { + (*neuron_model_addOperand)(model, &inType); // input + input_node = graph->Add(input_name, dims_in); + } + } + VLOG(3) << "input node idx" << input_node->index() + << ": input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << ":" << inType.dimensions[1] + << ":" << inType.dimensions[2] << ":" << inType.dimensions[3]; + + // Add bias type + NeuronOperandType biasType; + + // Add filter type + // filter NCHW -> NHWC + Tensor transpose_filter; + std::vector dims_filter; + + if (is_depthwise_mode) { + transpose_filter.Resize({1, + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}); + dims_filter = {1, + (uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[1], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[0]}; + } else { + transpose_filter.Resize({(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}); + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[1], + 
(uint32_t)filter_dims[2], + (uint32_t)filter_dims[3]}; + transpose(filter->data(), + transpose_filter.mutable_data(), + dims_filter, + {0, 2, 3, 1}); + + dims_filter = {(uint32_t)filter_dims[0], + (uint32_t)filter_dims[2], + (uint32_t)filter_dims[3], + (uint32_t)filter_dims[1]}; + } + + NeuronOperandType filterType; + NeuronOperandType channelFilterType; + NeuronSymmPerChannelQuantParams symmPerChannelQuantParams; + if (1 == weight_scale.size()) { + // Per layer type + filterType.type = NEURON_TENSOR_QUANT8_ASYMM; + filterType.scale = weight_scale[0]; + filterType.zeroPoint = 128; + filterType.dimensionCount = filter_dims.size(); + filterType.dimensions = &dims_filter[0]; + biasType.scale = inType.scale * filterType.scale; + } else { + // Per channel type + channelFilterType.type = NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL; + channelFilterType.scale = 0.0f; + channelFilterType.zeroPoint = 0; + channelFilterType.dimensionCount = filter_dims.size(); + channelFilterType.dimensions = &dims_filter[0]; + + // Per channel setting + if (is_depthwise_mode) + symmPerChannelQuantParams.channelDim = 3; + else + symmPerChannelQuantParams.channelDim = 0; + symmPerChannelQuantParams.scaleCount = weight_scale.size(); + symmPerChannelQuantParams.scales = weight_scale.data(); + biasType.scale = 0; + } + + std::shared_ptr filter_node = nullptr; + if (1 == weight_scale.size()) { + (*neuron_model_addOperand)(model, &filterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "filter node idx: " << filter_node->index() << "w_scale[0]" + << weight_scale[0] << ": filterType: " << filterType.dimensions[0] + << ":" << filterType.dimensions[1] << ":" + << filterType.dimensions[2] << ":" << filterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + } else { + (*neuron_model_addOperand)(model, &channelFilterType); // 1: filter + filter_node = graph->Add(filter_name, dims_filter); + VLOG(3) << "chennel filter node idx: " << filter_node->index() + << " ,scale_count:" << weight_scale.size() + << " weight_scale[0]:" << weight_scale.data()[0] + << " ,channelFilterType: " << channelFilterType.dimensions[0] << ":" + << channelFilterType.dimensions[1] << ":" + << channelFilterType.dimensions[2] << ":" + << channelFilterType.dimensions[3]; + memcpy(filter->mutable_data(), + transpose_filter.mutable_data(), + filter->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, filter_node->index(), filter->raw_data(), filter->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set filter operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + neuron_errCode = (*neuron_model_setOperandSymmPerChannelQuantParams)( + model, filter_node->index(), &symmPerChannelQuantParams); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set per channel filter params fail:" << neuron_errCode; + return subgraph::FAILED; + } + } + + // Add biasType node value + // A 1-D tensor, of shape [depth_out], specifying the bias. + // For filter tensor of NEURON_TENSOR_QUANT8_SYMM_PER_CHANNEL, the bias + // should be of ANEURALNETWORKS_TENSOR_INT32, with zeroPoint of 0 + // and bias_scale of 0. 
The actual scale of each value 'i' is equal + // to bias_scale[i] = input_scale * filter_scale[i]. + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + std::vector dims_bias; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + for (int i = 0; i < bias_dims.size(); i++) + dims_bias.push_back(bias_dims[i]); + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: " << bias_name + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << bias_dims; + } else { + biasType.dimensionCount = 1; + dims_bias = {(uint32_t)output_dims[1]}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(filter_name + "_default_bias", dims_bias); + VLOG(3) << "node idx" << bias_node->index() << ": Bias name: default_bias " + << " ,bias scale: " << biasType.scale + << " ,dimensions: " << dims_bias.size(); + } + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {1}; + + std::shared_ptr paddingL_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 3: padding left + paddingL_node = graph->Add(filter_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 4: padding right + paddingR_node = graph->Add(filter_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 5: padding top + paddingT_node = graph->Add(filter_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 6: padding bottom + paddingB_node = graph->Add(filter_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 7: stride width + strideW_node = graph->Add(filter_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 8: stride height + strideH_node = graph->Add(filter_name + "_stride_height", dims_int32); + + std::shared_ptr dm_node = nullptr; + if (is_depthwise_mode) { + (*neuron_model_addOperand)(model, &int32Type); // 9: depthwise multiplier + dm_node = graph->Add(filter_name + "_dm", dims_int32); + } + + std::shared_ptr fuse_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 9/10: fuse + fuse_node = graph->Add(filter_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + if (graph->IsOutput(output_name)) + outType.scale = output_scale / 127; + else + outType.scale = output_scale; + outType.zeroPoint = 128; + outType.dimensionCount = output_dims.size(); + std::vector dims_out = {(uint32_t)output_dims[0], + (uint32_t)output_dims[2], + (uint32_t)output_dims[3], + (uint32_t)output_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr output_node = nullptr; + if (graph->Has(output_name)) { + output_node = graph->Get(output_name); + } else { + // add output operand + if 
(graph->IsOutput(output_name)) { + (*neuron_model_addOperand)(model, &outType); // output + output_node = graph->Add("transpose_" + output_name, dims_out); + } else { + (*neuron_model_addOperand)(model, &outType); // output + output_node = graph->Add(output_name, dims_out); + } + } + VLOG(3) << "output node idx: " << output_node->index() + << ": output_scale: " << outType.scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add bias value + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32( + bias->data(), input_scale, weight_scale, int32_bias_data); + + VLOG(3) << "int32_bias_data: " << int32_bias_data[0] << " : " + << int32_bias_data[1] << " : " << int32_bias_data[2] << " : " + << int32_bias_data[3]; + neuron_errCode = (*neuron_model_setOperandValue)( + model, bias_node->index(), bias->raw_data(), bias->memory_size()); + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, output_dims[1]}); + int32_bias->mutable_data(); + VLOG(3) << "bias_default: " << int32_bias->memory_size(); + memset(int32_bias->mutable_data(), 0, int32_bias->memory_size()); + neuron_errCode = (*neuron_model_setOperandValue)(model, + bias_node->index(), + int32_bias->raw_data(), + int32_bias->memory_size()); + bias_node->set_data(int32_bias); + } + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set bias operand value fail:" << neuron_errCode; + return subgraph::FAILED; + } + + VLOG(3) << "paddings: " << paddings[0] << ":" << paddings[1] << ":" + << paddings[2] << ":" << paddings[3]; + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + (*neuron_model_setOperandValue)( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + (*neuron_model_setOperandValue)( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + (*neuron_model_setOperandValue)( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + (*neuron_model_setOperandValue)( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + VLOG(3) << " stride width:" << strides[1] << " height:" << strides[0]; + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + (*neuron_model_setOperandValue)( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + (*neuron_model_setOperandValue)( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + if (act_type == "relu") { + fuse_val[0] = 1; + } else if (act_type == "relu1") { + fuse_val[0] = 2; + } else if (act_type == "relu6") { + fuse_val[0] = 3; + } else if (!act_type.empty()) { + fuse_val[0] = 0; + LOG(WARNING) << "Unsupported act_type: " << act_type; + return FAILED; + } + + if (is_depthwise_mode) { + int32_t dm = oc / ic; + (*neuron_model_setOperandValue)( + model, dm_node->index(), &dm, sizeof(int32_t) * 1); + VLOG(3) << "depthwise multiplier:" << dm; + + // Depthwise conv + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias +
paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + dm_node->index(), // 9: depthwise multiplier + fuse_node->index()}; // 10 : fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_DEPTHWISE_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + input_node->index(), // 0: input + filter_node->index(), // 1: filter + bias_node->index(), // 2: bias + paddingL_node->index(), // 3: padding left + paddingR_node->index(), // 4: padding right + paddingT_node->index(), // 5: padding top + paddingB_node->index(), // 6: padding bottom + strideW_node->index(), // 7: stride width + strideH_node->index(), // 8: stride height + fuse_node->index()}; // 9: fuse + + std::vector addOutIndex = {output_node->index()}; + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_CONV_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + if (graph->IsOutput(output_name)) { + // Insert transpose for NHWC -> NCHW + insert_transpose_node( + ctx, + "transpose_" + output_name, + output_name, + dims_out, + {output_dims[0], output_dims[1], output_dims[2], output_dims[3]}, + {0, 3, 1, 2}, + outType.scale, + outType.zeroPoint); + output_node = graph->Get(output_name); + if (output_node == nullptr) return subgraph::FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); +REGISTER_SUBGRAPH_BRIDGE(depthwise_conv2d, + kAPU, + paddle::lite::subgraph::apu::ConvConverter); diff --git a/lite/kernels/apu/bridges/elementwise_ops.cc b/lite/kernels/apu/bridges/elementwise_ops.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c637e0fe746ce2a4d2b42dc902d62279967e73c --- /dev/null +++ b/lite/kernels/apu/bridges/elementwise_ops.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting " + op_type + "..."; + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + + auto y_name = op_info->Input("Y").front(); + auto y = scope->FindMutableTensor(y_name); + auto y_dims = y->dims(); + + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto axis = op_info->GetAttr("axis"); + + // Act node + if (op_type == "fusion_elementwise_add_activation" || + op_type == "fusion_elementwise_sub_activation" || + op_type == "fusion_elementwise_mul_activation" || + op_type == "fusion_elementwise_div_activation") { + auto act_type = op_info->GetAttr("act_type"); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(elementwise_add, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); +REGISTER_SUBGRAPH_BRIDGE(elementwise_mul, + kAPU, + paddle::lite::subgraph::apu::ElementwiseConverter); diff --git a/lite/kernels/apu/bridges/fc_op.cc b/lite/kernels/apu/bridges/fc_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d4ffc762e287618c8eb6b31908909cca4af91d1 --- /dev/null +++ b/lite/kernels/apu/bridges/fc_op.cc @@ -0,0 +1,250 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int FCConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + auto input_name = op_info->Input("Input").front(); + auto input = scope->FindMutableTensor(input_name); + auto input_dims = input->dims(); + CHECK_GE(input_dims.size(), 2UL); + auto w_name = op_info->Input("W").front(); + auto w = scope->FindMutableTensor(w_name); + auto w_dims = w->dims(); + CHECK_EQ(w_dims.size(), 2UL); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + + int in_num_col_dims = op_info->GetAttr("in_num_col_dims"); + int m = input_dims.Slice(0, in_num_col_dims).production(); + int k = input_dims.Slice(in_num_col_dims, input_dims.size()).production(); + int n = w_dims[1]; + CHECK_EQ(k * n, w_dims.production()); + VLOG(3) << "[APU] input dims: " << input_dims << " w dims: " << w_dims + << " out_dims: " << out_dims << " m: " << m << " k: " << k + << " n: " << n; + + float input_scale = 1.0f; + float out_scale = 1.0f; + std::vector w_scale; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("weight_scale")) + w_scale = op_info->GetAttr>("weight_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + return FAILED; + } + } else { + return FAILED; + } + + // Add input tensor type + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = input_scale; + inType.zeroPoint = 128; + inType.dimensionCount = input_dims.size(); + std::vector dims_in = {(uint32_t)input_dims[0], + (uint32_t)input_dims[2], + (uint32_t)input_dims[3], + (uint32_t)input_dims[1]}; + + inType.dimensions = &dims_in[0]; + std::shared_ptr in_node = nullptr; + if (graph->Has(input_name)) { + // input operand already exist + in_node = graph->Get(input_name); + VLOG(3) << "Graph has " << input_name << ",index: " << in_node->index(); + } else { + // add input operand + (*neuron_model_addOperand)(model, &inType); // 0: input + in_node = graph->Add(input_name, dims_in); + } + VLOG(3) << "input_scale: " << input_scale + << ", inType: " << inType.dimensions[0] << " : " + << inType.dimensions[1] << " : " << inType.dimensions[2] << " : " + << inType.dimensions[3]; + + NeuronOperandType wType; + wType.type = NEURON_TENSOR_QUANT8_ASYMM; + wType.scale = w_scale[0]; + wType.zeroPoint = 128; + wType.dimensionCount = w_dims.size(); + std::vector dims_w = {(uint32_t)w_dims[1], (uint32_t)w_dims[0]}; + wType.dimensions = &dims_w[0]; + (*neuron_model_addOperand)(model, &wType); // 1: weight + std::shared_ptr w_node = nullptr; + w_node = graph->Add(w_name, dims_w); + VLOG(3) << "w_scale size: " << w_scale.size() 
<< ",w_scale: " << w_scale[0] + << ", wType dimensions: " << wType.dimensions[0] << " : " + << wType.dimensions[1] << ", memory size: " << w->memory_size(); + + // Add bias type + NeuronOperandType biasType; + biasType.type = NEURON_TENSOR_INT32; + biasType.zeroPoint = 0; + biasType.scale = input_scale * w_scale[0]; + std::shared_ptr bias_node = nullptr; + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias_type = kernel->GetInputDeclType("Bias"); + auto bias = scope->FindMutableTensor(bias_name); + auto bias_dims = bias->dims(); + + biasType.dimensionCount = bias_dims.size(); + std::vector dims_bias = {(uint32_t)bias_dims[0]}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(bias_name, dims_bias); + VLOG(3) << "Bias name: " << bias_name << ", bias dims: " << bias_dims + << ", bias scale: " << biasType.scale + << " ,memory size: " << bias->memory_size(); + } else { + biasType.dimensionCount = 1; + std::vector dims_bias = {(uint32_t)n}; + biasType.dimensions = &dims_bias[0]; + (*neuron_model_addOperand)(model, &biasType); // 2: bias + bias_node = graph->Add(w_name + "_default_bias", dims_bias); + } + + // Add fuse type + NeuronOperandType fuseType; + fuseType.type = NEURON_INT32; + fuseType.dimensionCount = 0; + std::vector dims_int32 = {0}; + (*neuron_model_addOperand)(model, &fuseType); // 3: fuse + std::shared_ptr fuse_node = nullptr; + fuse_node = graph->Add(w_name + "_fuse", dims_int32); + + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = 2; + std::vector dims_out = {(uint32_t)out_dims[0], out_dims[1]}; + outType.dimensions = &dims_out[0]; + VLOG(3) << "out_scale: " << out_scale + << ", outType: " << outType.dimensions[0] << " : " + << outType.dimensions[1]; + (*neuron_model_addOperand)(model, &outType); // output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_out); + + int8_t* w_data = w->mutable_data(); + Tensor transpose_filter; + // Original dimension + transpose_filter.Resize({(uint32_t)w_dims[1], (uint32_t)w_dims[0]}); + transpose_filter.mutable_data(); + transposeAsym(w->data(), + transpose_filter.mutable_data(), + {1, 1, (uint32_t)w_dims[0], (uint32_t)w_dims[1]}, + {0, 1, 3, 2}); + memcpy(w->mutable_data(), + transpose_filter.mutable_data(), + w->memory_size()); + int neuron_errCode = (*neuron_model_setOperandValue)( + model, w_node->index(), w->raw_data(), w->memory_size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Set W operand value fail:" << neuron_errCode + << ",index: " << w_node->index(); + return FAILED; + } + + // Add bias if bias tensor exists + if (HasInputArg(op_info, scope, "Bias")) { + auto bias_name = op_info->Input("Bias").front(); + auto bias = scope->FindMutableTensor(bias_name); + int32_t* int32_bias_data = + reinterpret_cast(bias->mutable_data()); + float2int32(bias->data(), input_scale, w_scale, int32_bias_data); + + VLOG(3) << int32_bias_data[0] << ":" << int32_bias_data[1] << ":" + << int32_bias_data[2] << ":" << int32_bias_data[3]; + neuron_errCode = + (*neuron_model_setOperandValue)(model, + bias_node->index(), + bias->raw_data(), + bias->memory_size()); // 2: bias + } else { + auto int32_bias = std::make_shared(); + int32_bias->Resize({1, out_dims[1]}); + int32_bias->mutable_data(); + memset(int32_bias->mutable_data(), 0, 
+           int32_bias->memory_size());
+    VLOG(3) << "default: " << int32_bias->memory_size();
+    neuron_errCode =
+        (*neuron_model_setOperandValue)(model,
+                                        bias_node->index(),
+                                        int32_bias->raw_data(),
+                                        int32_bias->memory_size());  // 2: bias
+    bias_node->set_data(int32_bias);
+  }
+  // Add fuse value
+  int32_t fuse_val[1] = {0};
+  (*neuron_model_setOperandValue)(
+      model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1);  // 3: fuse
+
+  std::vector<uint32_t> addInIndex = {in_node->index(),
+                                      w_node->index(),
+                                      bias_node->index(),
+                                      fuse_node->index()};
+  std::vector<uint32_t> addOutIndex = {out_node->index()};
+  neuron_errCode = (*neuron_model_addOperation)(model,
+                                                NEURON_FULLY_CONNECTED,
+                                                addInIndex.size(),
+                                                &addInIndex[0],
+                                                addOutIndex.size(),
+                                                &addOutIndex[0]);
+
+  if (NEURON_NO_ERROR != neuron_errCode) {
+    LOG(WARNING) << "Add op fail:" << op_type;
+    return FAILED;
+  }
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(fc, kAPU, paddle::lite::subgraph::apu::FCConverter);
diff --git a/lite/kernels/apu/bridges/graph.cc b/lite/kernels/apu/bridges/graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..515853aa26a1d84339c61047b5d3be20478b5ca3
--- /dev/null
+++ b/lite/kernels/apu/bridges/graph.cc
@@ -0,0 +1,46 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/apu/bridges/graph.h"
+#include <utility>
+#include "lite/kernels/apu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace apu {
+
+int Graph::Add(const std::string& name, std::shared_ptr<Node> node) {
+  auto it = nodes_.find(name);
+
+  if (it != nodes_.end()) {
+    LOG(FATAL) << "[APU] Node " << name << " is redefined.";
+    return -1;
+  } else {
+    VLOG(3) << " Add: " << name << " : " << node->index();
+    auto ret = nodes_.insert(
+        std::make_pair(name, std::vector<std::shared_ptr<Node>>()));
+    CHECK(ret.second);
+    it = ret.first;
+  }
+  operandIdx_ += 1;
+  it->second.push_back(node);
+
+  return it->second.size();
+}
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/apu/bridges/graph.h b/lite/kernels/apu/bridges/graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..857800abddbebb411fa607ecbf6a8b2dff702b2b
--- /dev/null
+++ b/lite/kernels/apu/bridges/graph.h
@@ -0,0 +1,113 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "NeuronAdapter.h"
+#include "lite/core/op_lite.h"
+#include "lite/core/tensor.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace apu {
+
+// Graph and Node are defined to collect all of the converted Neuron IR nodes
+class Node {
+ public:
+  Node(int32_t operand_idx, std::vector<uint32_t> shape)
+      : idx_(operand_idx), shape_(shape) {}
+
+  void set_shape(std::vector<uint32_t> shape) { shape_ = shape; }
+
+  uint32_t index() { return idx_; }
+  std::vector<uint32_t> shape() const { return shape_; }
+  void set_data(std::shared_ptr<Tensor> data) { data_ = data; }
+
+ private:
+  int32_t idx_;
+  std::vector<uint32_t> shape_;
+  std::shared_ptr<Tensor> data_{nullptr};
+};
+
+class Graph {
+ public:
+  int Add(const std::string& name, std::shared_ptr<Node> node);
+
+  // Variable, const or data node
+  std::shared_ptr<Node> Add(const std::string& name,
+                            std::vector<uint32_t> shape) {
+    CHECK(shape.size()) << name << " : " << shape.size();
+    auto node = std::make_shared<Node>(operandIdx_, shape);
+    auto idx = Add(name, node);
+    CHECK_GE(idx, 1);
+
+    return node;
+  }
+
+  void set_model(NeuronModel* model) { model_ = model; }
+  NeuronModel* model() { return model_; }
+
+  void set_libHandle(void* libHandle) { libHandle_ = libHandle; }
+  void* libHandle() { return libHandle_; }
+
+  void set_input_names(const std::vector<std::string> input_names) {
+    input_names_ = input_names;
+  }
+
+  bool IsInput(const std::string& name) {
+    for (size_t i = 0; i < input_names_.size(); i++) {
+      if (input_names_[i] == name) return true;
+    }
+    return false;
+  }
+
+  bool IsOutput(const std::string& name) {
+    for (size_t i = 0; i < output_names_.size(); i++) {
+      if (output_names_[i] == name) return true;
+    }
+    return false;
+  }
+
+  void set_output_names(const std::vector<std::string> output_names) {
+    output_names_ = output_names;
+  }
+
+  std::shared_ptr<Node> Get(std::string name) {
+    CHECK(Has(name)) << "[APU] Node " << name << " not found.";
+    return nodes_.at(name).back();
+  }
+
+  bool Has(const std::string& name) {
+    return nodes_.find(name) != nodes_.end();
+  }
+
+ private:
+  void* libHandle_;
+  NeuronModel* model_;
+  std::unordered_map<std::string, std::vector<std::shared_ptr<Node>>> nodes_;
+  int32_t operandIdx_ = 0;
+  std::vector<std::string> input_names_;
+  std::vector<std::string> output_names_;
+};
+
+}  // namespace apu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/kernels/apu/bridges/paddle_use_bridges.h b/lite/kernels/apu/bridges/paddle_use_bridges.h
new file mode 100644
index 0000000000000000000000000000000000000000..e3e68afc6c7c18d2b8d68361ac09de2abf2b684c
--- /dev/null
+++ b/lite/kernels/apu/bridges/paddle_use_bridges.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once + +USE_SUBGRAPH_BRIDGE(relu, kAPU); +USE_SUBGRAPH_BRIDGE(conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_add, kAPU); +USE_SUBGRAPH_BRIDGE(elementwise_mul, kAPU); +USE_SUBGRAPH_BRIDGE(fc, kAPU); +USE_SUBGRAPH_BRIDGE(pool2d, kAPU); +USE_SUBGRAPH_BRIDGE(softmax, kAPU); diff --git a/lite/kernels/apu/bridges/pool_op.cc b/lite/kernels/apu/bridges/pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..5d17ba7a433f5367328f3826d815c65bd75a6f9a --- /dev/null +++ b/lite/kernels/apu/bridges/pool_op.cc @@ -0,0 +1,279 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/operators/pool_op.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "] "; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + auto out_name = op_info->Output("Out").front(); + auto out = scope->FindMutableTensor(out_name); + auto out_dims = out->dims(); + auto pooling_type = op_info->GetAttr("pooling_type"); + auto global_pooling = op_info->GetAttr("global_pooling"); + auto ksize = op_info->GetAttr>("ksize"); + auto paddings = op_info->GetAttr>("paddings"); + + // pool mode + if ((pooling_type == "max") || (pooling_type == "avg")) { + } else { + LOG(WARNING) << "[APU] Unsupported pooling type: " << pooling_type; + return FAILED; + } + + // pad mode + int pad_mode = 0; + std::string padding_algorithm(""); + if (op_info->HasAttr("padding_algorithm")) { + padding_algorithm = op_info->GetAttr("padding_algorithm"); + } + if (padding_algorithm == "SAME") { + pad_mode = 6; + } else if (padding_algorithm == "VALID") { + pad_mode = 5; + } + + // paddings and strides + if (paddings.size() == 2L) { + for (size_t i = 0; i < 2L; ++i) { + int copy_pad = *(paddings.begin() + 2 * i); + paddings.insert(paddings.begin() + 2 * i + 1, copy_pad); + } + } + CHECK_EQ(paddings.size(), 4L) + << "[APU] Paddings size should be the same or twice as the inputs size."; + + bool adaptive = false; + if (op_info->HasAttr("adaptive")) { + adaptive = op_info->GetAttr("adaptive"); + } + auto strides = 
op_info->GetAttr>("strides"); + lite::operators::UpdatePadding(&paddings, + global_pooling, + adaptive, + padding_algorithm, + x->dims(), + strides, + ksize); + + // Add x tensor type + float x_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + x_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = x_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x = {(uint32_t)x_dims[0], + (uint32_t)x_dims[2], + (uint32_t)x_dims[3], + (uint32_t)x_dims[1]}; + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + LOG(INFO) << "Graph has " << x_name; + // input operand already exist + x_node = graph->Get(x_name); + } else { + // add input operand + (*neuron_model_addOperand)(model, &xType); // 0: x + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "x_scale: " << x_scale << ", xType: " << xType.dimensions[0] << ":" + << xType.dimensions[1] << ":" << xType.dimensions[2] << ":" + << xType.dimensions[3]; + + NeuronOperandType int32Type; + int32Type.type = NEURON_INT32; + int32Type.dimensionCount = 0; + std::vector dims_int32 = {0}; + + std::shared_ptr paddingL_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 1: padding left + paddingL_node = graph->Add(x_name + "_padding_left", dims_int32); + + std::shared_ptr paddingR_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 2: padding right + paddingR_node = graph->Add(x_name + "_padding_right", dims_int32); + + std::shared_ptr paddingT_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 3: padding top + paddingT_node = graph->Add(x_name + "_padding_top", dims_int32); + + std::shared_ptr paddingB_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 4: padding bottom + paddingB_node = graph->Add(x_name + "_padding_bottom", dims_int32); + + std::shared_ptr strideW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 5: stride width + strideW_node = graph->Add(x_name + "_stride_width", dims_int32); + + std::shared_ptr strideH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 6: stride height + strideH_node = graph->Add(x_name + "_stride_height", dims_int32); + + std::shared_ptr filterW_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 7: filter width + filterW_node = graph->Add(x_name + "_filter_width", dims_int32); + + std::shared_ptr filterH_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 8: filter height + filterH_node = graph->Add(x_name + "_filter_height", dims_int32); + + std::shared_ptr fuse_node = nullptr; + (*neuron_model_addOperand)(model, &int32Type); // 9: fuse + fuse_node = graph->Add(x_name + "_fuse", dims_int32); + + // Add out type + // Add output tensor type + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale; + outType.zeroPoint = 128; + outType.dimensionCount = out_dims.size(); + std::vector dims_out = {(uint32_t)out_dims[0], + (uint32_t)out_dims[2], + (uint32_t)out_dims[3], + (uint32_t)out_dims[1]}; + outType.dimensions = &dims_out[0]; + std::shared_ptr out_node 
= nullptr; + if (graph->Has(out_name)) { + out_node = graph->Get(out_name); + } else { + (*neuron_model_addOperand)(model, &outType); // out + out_node = graph->Add(out_name, dims_out); + } + VLOG(3) << "output_scale: " << x_scale + << ", outType: " << outType.dimensions[0] << ":" + << outType.dimensions[1] << ":" << outType.dimensions[2] << ":" + << outType.dimensions[3]; + + // Add padding value + int32_t padding_val[1]; + padding_val[0] = paddings[2]; + (*neuron_model_setOperandValue)( + model, paddingL_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[3]; + (*neuron_model_setOperandValue)( + model, paddingR_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[0]; + (*neuron_model_setOperandValue)( + model, paddingT_node->index(), padding_val, sizeof(int32_t) * 1); + padding_val[0] = paddings[1]; + (*neuron_model_setOperandValue)( + model, paddingB_node->index(), padding_val, sizeof(int32_t) * 1); + + // Add Stride + int32_t stride_val[1]; + stride_val[0] = strides[1]; // width + (*neuron_model_setOperandValue)( + model, strideW_node->index(), stride_val, sizeof(int32_t) * 1); + stride_val[0] = strides[0]; // height + (*neuron_model_setOperandValue)( + model, strideH_node->index(), stride_val, sizeof(int32_t) * 1); + + // Add filter + int32_t filter_val[1]; + filter_val[0] = global_pooling ? x_dims[3] : ksize[1]; // width + (*neuron_model_setOperandValue)( + model, filterW_node->index(), filter_val, sizeof(int32_t) * 1); + filter_val[0] = global_pooling ? x_dims[2] : ksize[0]; // height + (*neuron_model_setOperandValue)( + model, filterH_node->index(), filter_val, sizeof(int32_t) * 1); + + // Add fuse + int32_t fuse_val[1] = {0}; + (*neuron_model_setOperandValue)( + model, fuse_node->index(), fuse_val, sizeof(int32_t) * 1); + + std::vector addInIndex = {x_node->index(), + paddingL_node->index(), + paddingR_node->index(), + paddingT_node->index(), + paddingB_node->index(), + strideW_node->index(), + strideH_node->index(), + filterW_node->index(), + filterH_node->index(), + fuse_node->index()}; + std::vector addOutIndex = {out_node->index()}; + + int neuron_errCode; + if (pooling_type == "max") { + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_MAX_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } else { + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_AVERAGE_POOL_2D, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(pool2d, + kAPU, + paddle::lite::subgraph::apu::PoolConverter); diff --git a/lite/kernels/apu/bridges/softmax_op.cc b/lite/kernels/apu/bridges/softmax_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..59fa8fdfe32c85bfaea5825c82b4752632fd8bed --- /dev/null +++ b/lite/kernels/apu/bridges/softmax_op.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/utility.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +int SoftmaxConverter(void* ctx, OpLite* op, KernelBase* kernel) { + CHECK(ctx != nullptr); + CHECK(op != nullptr); + auto graph = static_cast(ctx); + auto model = graph->model(); + auto op_info = op->op_info(); + auto op_type = op_info->Type(); + auto scope = op->scope(); + VLOG(3) << "[APU] Converting [" + op_type + "]"; + + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Get input and output vars and op attributes + auto x_name = op_info->Input("X").front(); + auto x = scope->FindMutableTensor(x_name); + auto x_dims = x->dims(); + CHECK_GE(x_dims.size(), 2UL); + auto x_rank = x_dims.size(); + auto out_name = op_info->Output("Out").front(); + + // Check output shape + auto axis = op_info->GetAttr("axis"); + if (axis < 0) { + axis += x_rank; + } + + float input_scale = 1.0f; + float out_scale = 1.0f; + if (op_info->HasAttr("enable_int8")) { + if (op_info->GetAttr("enable_int8")) { + if (op_info->HasAttr("input_scale")) + input_scale = op_info->GetAttr("input_scale"); + if (op_info->HasAttr("output_scale")) + out_scale = op_info->GetAttr("output_scale"); + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + } else { + LOG(WARNING) << "Do not enable_int8"; + return FAILED; + } + + // Check output scale + NeuronOperandType xType; + xType.type = NEURON_TENSOR_QUANT8_ASYMM; + xType.scale = input_scale; + xType.zeroPoint = 128; + xType.dimensionCount = x_dims.size(); + std::vector dims_x; + for (int i = 0; i < x_dims.size(); i++) dims_x.push_back(x_dims[i]); + xType.dimensions = &dims_x[0]; + std::shared_ptr x_node = nullptr; + if (graph->Has(x_name)) { + // input operand already exist + x_node = graph->Get(x_name); + VLOG(3) << "Graph has " << x_name << ",index: " << x_node->index(); + } else { + // add input operand + (*neuron_model_addOperand)(model, &xType); // 0: input + x_node = graph->Add(x_name, dims_x); + } + VLOG(3) << "input_scale size: " << input_scale + << " ,x_dims size: " << x_dims.size() << " ,x_dims: " << x_dims; + + // Add beta operand + std::vector dims_int32 = {0}; + NeuronOperandType betaType; + betaType.type = NEURON_FLOAT32; + betaType.dimensionCount = 0; + (*neuron_model_addOperand)(model, &betaType); // 1: beta + std::shared_ptr beta_node = nullptr; + beta_node = graph->Add(x_name + "_beta", dims_int32); + + // Add axis operand + NeuronOperandType axisType; + axisType.type = NEURON_INT32; + axisType.dimensionCount = 0; + (*neuron_model_addOperand)(model, &axisType); // 2: axis + std::shared_ptr axis_node = nullptr; + axis_node = graph->Add(x_name + "_axis", dims_int32); + + // Add out operand + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = out_scale / 127; + outType.zeroPoint = 128; + outType.dimensionCount = x_dims.size(); + outType.dimensions = &dims_x[0]; + (*neuron_model_addOperand)(model, &outType); // 3: output + std::shared_ptr out_node = nullptr; + out_node = graph->Add(out_name, dims_x); + VLOG(3) << "output_scale: " << out_scale; + + 
float beta_val[] = {1.0f}; + (*neuron_model_setOperandValue)( + model, beta_node->index(), beta_val, sizeof(float) * 1); + + int32_t axis_val[1]; + axis_val[0] = axis; + (*neuron_model_setOperandValue)( + model, axis_node->index(), axis_val, sizeof(int32_t) * 1); + std::vector addInIndex = { + x_node->index(), beta_node->index(), axis_node->index()}; + std::vector addOutIndex = {out_node->index()}; + int neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_SOFTMAX, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Add op fail:" << op_type; + return FAILED; + } + + return REBUILD_WHEN_SHAPE_CHANGED; +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle + +REGISTER_SUBGRAPH_BRIDGE(softmax, + kAPU, + paddle::lite::subgraph::apu::SoftmaxConverter); diff --git a/lite/kernels/apu/bridges/utility.cc b/lite/kernels/apu/bridges/utility.cc new file mode 100644 index 0000000000000000000000000000000000000000..eab4d008e57b152e25a131a553fc7cee4f1d7e39 --- /dev/null +++ b/lite/kernels/apu/bridges/utility.cc @@ -0,0 +1,257 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "lite/kernels/apu/bridges/utility.h" +#include +#include "lite/kernels/apu/bridges/graph.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +// typedef to the build functions pointer signatures +typedef int (*Neuron_getVersion)(uint32_t* version); +typedef int (*NeuronModel_create)(NeuronModel** model); +typedef void (*NeuronModel_free)(NeuronModel* model); +typedef int (*NeuronModel_finish)(NeuronModel* model); +typedef int (*NeuronModel_addOperand)(NeuronModel* model, + const NeuronOperandType* type); +typedef int (*NeuronModel_setOperandValue)(NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); +typedef int (*NeuronModel_addOperation)(NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); +typedef int (*NeuronExecution_create)(NeuronCompilation* compilation, + NeuronExecution** execution); +typedef void (*NeuronExecution_free)(NeuronExecution* execution); +typedef int (*NeuronExecution_setInput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); +typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); +typedef int (*NeuronExecution_compute)(NeuronExecution* execution); + +void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname) { + auto iarg_names = op_info->input_argnames(); + if (std::find(iarg_names.begin(), iarg_names.end(), argname) != + iarg_names.end()) { + auto inputs = op_info->Input(argname); + if (inputs.empty()) { + return false; + } + auto var_name = inputs.front(); + auto var = scope->FindVar(var_name); + return var != nullptr; + } else { + return false; + } +} + +void insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint) { + int neuron_errCode; + auto graph = static_cast(ctx); + auto model = graph->model(); + auto libHandle = graph->libHandle(); + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperand, neuron_model_addOperand) + LOAD_FUNCTIONS( + libHandle, NeuronModel_setOperandValue, neuron_model_setOperandValue) + LOAD_FUNCTIONS(libHandle, NeuronModel_addOperation, neuron_model_addOperation) + + // Add input + NeuronOperandType inType; + inType.type = NEURON_TENSOR_QUANT8_ASYMM; + inType.scale = scale; + inType.zeroPoint = zeroPoint; + inType.dimensionCount = input_shape.size(); + inType.dimensions = &input_shape[0]; + + std::shared_ptr input_node = nullptr; + if (graph->Has(input_name)) { + VLOG(3) << "Has " << input_name; + input_node = graph->Get(input_name); + } else { + neuron_errCode = (*neuron_model_addOperand)(model, &inType); // input + if 
(NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + VLOG(3) << "Add " << input_name; + input_node = graph->Add(input_name, input_shape); + } + + // Add perm + NeuronOperandType permsType; + permsType.type = NEURON_TENSOR_INT32; + permsType.dimensionCount = 1; + uint32_t dims_perms[1] = {4}; + permsType.dimensions = dims_perms; + + neuron_errCode = (*neuron_model_addOperand)(model, &permsType); // perm + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + std::shared_ptr perms_node = nullptr; + perms_node = graph->Add(input_name + "_perms", {4}); + + VLOG(3) << "axis :" << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" + << axis[3]; + // &axis[0], sizeof(int32_t) * axis.size()); + neuron_errCode = (*neuron_model_setOperandValue)( + model, perms_node->index(), &axis[0], sizeof(int32_t) * axis.size()); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + return; + } + + // Add output + NeuronOperandType outType; + outType.type = NEURON_TENSOR_QUANT8_ASYMM; + outType.scale = scale; + outType.zeroPoint = zeroPoint; + outType.dimensionCount = output_shape.size(); + outType.dimensions = &output_shape[0]; + + (*neuron_model_addOperand)(model, &outType); // output + std::shared_ptr output_node = nullptr; + output_node = graph->Add(output_name, output_shape); + + std::vector addInIndex = {input_node->index(), // 0: input + perms_node->index()}; // 1: perm + + std::vector addOutIndex = {output_node->index()}; + + neuron_errCode = (*neuron_model_addOperation)(model, + NEURON_TRANSPOSE, + addInIndex.size(), + &addInIndex[0], + addOutIndex.size(), + &addOutIndex[0]); + + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Insert transpose op fail!"; + } +} + +void transpose(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index]; + } + } + } + } +} + +void transposeAsym(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis) { + int old_index = -1; + int new_index = -1; + int dim[4] = {0}; + std::vector shape = input_shape; + VLOG(3) << input_shape[0] << ":" << input_shape[1] << ":" << input_shape[2] + << ":" << input_shape[3]; + VLOG(3) << axis[0] << ":" << axis[1] << ":" << axis[2] << ":" << axis[3]; + for (dim[0] = 0; dim[0] < input_shape[0]; dim[0]++) { + for (dim[1] = 0; dim[1] < input_shape[1]; dim[1]++) { + for (dim[2] = 0; dim[2] < input_shape[2]; dim[2]++) { + for (dim[3] = 0; dim[3] < input_shape[3]; dim[3]++) { + old_index = dim[0] * shape[1] * shape[2] * shape[3] + + dim[1] * shape[2] * shape[3] + dim[2] * shape[3] + 
dim[3]; + new_index = + dim[axis[0]] * shape[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[1]] * shape[axis[2]] * shape[axis[3]] + + dim[axis[2]] * shape[axis[3]] + dim[axis[3]]; + + output_data[new_index] = input_data[old_index] + 128; // per layer + } + } + } + } +} + +void float2int32(const float* bias_data, + float input_scale, + std::vector weight_scale, + int32_t* int32_bias_data) { + for (int i = 0; i < weight_scale.size(); i++) { + int32_bias_data[i] = bias_data[i] / (input_scale * weight_scale[i]); + } +} + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/bridges/utility.h b/lite/kernels/apu/bridges/utility.h new file mode 100644 index 0000000000000000000000000000000000000000..da3f3cd1835a85f3f9d8f4aa3288bd9eebb39ad8 --- /dev/null +++ b/lite/kernels/apu/bridges/utility.h @@ -0,0 +1,111 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include "NeuronAdapter.h" +#include "lite/core/op_lite.h" +#include "lite/utils/macros.h" + +namespace paddle { +namespace lite { +namespace subgraph { +namespace apu { + +// typedef to the build functions pointer signatures +typedef int (*Neuron_getVersion)(uint32_t* version); +typedef int (*NeuronModel_create)(NeuronModel** model); +typedef void (*NeuronModel_free)(NeuronModel* model); +typedef int (*NeuronModel_finish)(NeuronModel* model); +typedef int (*NeuronModel_addOperand)(NeuronModel* model, + const NeuronOperandType* type); +typedef int (*NeuronModel_setOperandValue)(NeuronModel* model, + int32_t index, + const void* buffer, + size_t length); +typedef int (*NeuronModel_addOperation)(NeuronModel* model, + NeuronOperationType type, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel* model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); +typedef int (*NeuronModel_setOperandSymmPerChannelQuantParams)( + NeuronModel* model, + int32_t index, + const NeuronSymmPerChannelQuantParams* channelQuant); +typedef int (*NeuronExecution_create)(NeuronCompilation* compilation, + NeuronExecution** execution); +typedef void (*NeuronExecution_free)(NeuronExecution* execution); +typedef int (*NeuronExecution_setInput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); +typedef int (*NeuronExecution_setOutput)(NeuronExecution* execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); +typedef int (*NeuronExecution_compute)(NeuronExecution* execution); + +void* LoadFunc(void* libHandle, const char* name); + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + +// 
Type/tensor converters for converting Paddle type/tensor to HiAI type/tensor +bool HasInputArg(const OpInfo* op_info, + const Scope* scope, + const std::string& argname); + +void insert_transpose_node(void* ctx, + const std::string& input_name, + const std::string& output_name, + std::vector input_shape, + std::vector output_shape, + std::vector axis, + float scale, + int32_t zeroPoint); + +void transpose(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis); + +void transposeAsym(const int8_t* input_data, + uint8_t* output_data, + std::vector input_shape, + std::vector axis); + +void float2int32(const float* bias_data, + float input_scale, + std::vector weight_scale, + int32_t* int32_bias_data); + +} // namespace apu +} // namespace subgraph +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/apu/subgraph_compute.cc b/lite/kernels/apu/subgraph_compute.cc new file mode 100644 index 0000000000000000000000000000000000000000..6a88b7f8c84fa3daec403373acee69dd84d60498 --- /dev/null +++ b/lite/kernels/apu/subgraph_compute.cc @@ -0,0 +1,297 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "lite/kernels/apu/subgraph_compute.h" +#include +#include +#include +#include +#include "lite/backends/apu/device.h" +#include "lite/core/op_registry.h" +#include "lite/kernels/apu/bridges/graph.h" +#include "lite/kernels/apu/bridges/paddle_use_bridges.h" +#include "lite/kernels/apu/bridges/utility.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +inline void* LoadFunc(void* libHandle, const char* name) { + CHECK(libHandle != nullptr); + CHECK(name != nullptr); + void* fn = dlsym(libHandle, name); + if (fn == nullptr) { + LOG(WARNING) << "Unable to open Neuron Runtime function [" << name + << "] Because " << dlerror(); + } + return fn; +} + +#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \ + FUNC_NAME VARIABLE_NAME = \ + reinterpret_cast(LoadFunc(libHandle, #FUNC_NAME)); + +int SubgraphEngine::BuildDeviceProgram() { + typedef int (*Neuron_getVersion)(uint32_t * version); + typedef int (*NeuronModel_create)(NeuronModel * *model); + typedef void (*NeuronModel_free)(NeuronModel * model); + typedef int (*NeuronModel_finish)(NeuronModel * model); + typedef int (*NeuronModel_identifyInputsAndOutputs)(NeuronModel * model, + uint32_t inputCount, + const uint32_t* inputs, + uint32_t outputCount, + const uint32_t* outputs); + + // Open the share library + libHandle_ = dlopen("libneuron_adapter.so", RTLD_LAZY); + if (libHandle_ == nullptr) { + LOG(WARNING) << "Failed to open libneuron_adapter.so. 
" << dlerror(); + return subgraph::FAILED; + } + + LOAD_FUNCTIONS(libHandle_, Neuron_getVersion, neuron_getVersion) + LOAD_FUNCTIONS(libHandle_, NeuronModel_create, neuron_model_create) + LOAD_FUNCTIONS(libHandle_, NeuronModel_finish, neuron_model_finish) + LOAD_FUNCTIONS(libHandle_, + NeuronModel_identifyInputsAndOutputs, + neuron_model_identifyInputsAndOutputs) + + unsigned int version; + (*neuron_getVersion)(&version); + VLOG(3) << "Neuron Adapter version: " << version; + + int status = 0; + subgraph::apu::Graph graph; + int neuron_errCode = (*neuron_model_create)(&model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create model"; + return subgraph::FAILED; + } + graph.set_libHandle(libHandle_); + graph.set_model(model_); + graph.set_input_names(input_names_); + graph.set_output_names(output_names_); + + // Convert all of ops and their input vars and weights and added into the APU + // NIR graph + const auto& bridges = subgraph::Registry::Instance(); + for (auto& inst : origin_program_) { + auto op = const_cast(inst.op()); + CHECK(op); + op->CheckShape(); + op->InferShape(); + std::string op_type = op->op_info()->Type(); + if (!bridges.Exists(op_type, TARGET(kAPU))) { + return subgraph::FAILED; + } + + auto kernel = inst.kernel(); + status |= + bridges.Select(op_type, TARGET(kAPU))(reinterpret_cast(&graph), + const_cast(op), + const_cast(kernel)); + if (subgraph::CHECK_FAILED(status)) { + return subgraph::FAILED; + } + } + + // Get input tensor + std::vector ins; + origin_itensors_.resize(input_names_.size()); + origin_idims_.resize(input_names_.size()); + for (int i = 0; i < input_names_.size(); i++) { + origin_itensors_[i] = scope_->FindMutableTensor(input_names_[i]); + CHECK(origin_itensors_[i]); + origin_idims_[i] = origin_itensors_[i]->dims(); + VLOG(3) << "subgraph input name: " << i << ", " << input_names_[i] << ":" + << origin_idims_[i].production(); + // Get input index + int idx; + if (graph.Has(input_names_[i])) { + ins.push_back(graph.Get(input_names_[i])->index()); + VLOG(3) << "input idx: " << graph.Get(input_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find input: " << input_names_[i]; + return subgraph::FAILED; + } + } + + // Get output tensor + std::vector outs; + origin_otensors_.resize(output_names_.size()); + origin_odims_.resize(output_names_.size()); + for (int i = 0; i < output_names_.size(); i++) { + origin_otensors_[i] = scope_->FindMutableTensor(output_names_[i]); + CHECK(origin_otensors_[i]); + origin_odims_[i] = origin_otensors_[i]->dims(); + VLOG(3) << "subgraph output name: " << i << ", " << output_names_[i] << ":" + << origin_odims_[i].production(); + origin_otensors_[i]->mutable_data(); + // Get input index + if (graph.Has(output_names_[i])) { + outs.push_back(graph.Get(output_names_[i])->index()); + VLOG(3) << "output idx: " << graph.Get(output_names_[i])->index(); + } else { + LOG(WARNING) << "Fail to find output: " << output_names_[i]; + return subgraph::FAILED; + } + } + + VLOG(3) << "ins size: " << ins.size() << " outs size:" << outs.size(); + // Set subgraph input/output + (*neuron_model_identifyInputsAndOutputs)( + model_, ins.size(), &ins[0], outs.size(), &outs[0]); + neuron_errCode = (*neuron_model_finish)(model_); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to create NIR model:" << neuron_errCode; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU NIR model created!"; + + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * 
time.tv_sec + time.tv_usec; + }; + auto start_time = GetCurrentUS(); + compilation_ = lite::apu::Device::Global().Build(libHandle_, model_); + if (compilation_ == nullptr) { + LOG(WARNING) << "[APU] Build APU DLA model failed!"; + return subgraph::FAILED; + } + VLOG(3) << "[APU] APU DLA model created, Build cost " + << GetCurrentUS() - start_time << " us"; + + return status; +} + +int SubgraphEngine::LaunchDeviceProgram() { + typedef int (*NeuronExecution_create)(NeuronCompilation * compilation, + NeuronExecution * *execution); + typedef void (*NeuronExecution_free)(NeuronExecution * execution); + typedef int (*NeuronExecution_setInput)(NeuronExecution * execution, + int32_t index, + const NeuronOperandType* type, + const void* buffer, + size_t length); + typedef int (*NeuronExecution_setOutput)(NeuronExecution * execution, + int32_t index, + const NeuronOperandType* type, + void* buffer, + size_t length); + typedef int (*NeuronExecution_compute)(NeuronExecution * execution); + + LOAD_FUNCTIONS(libHandle_, NeuronExecution_create, neuron_execution_create) + LOAD_FUNCTIONS(libHandle_, NeuronExecution_free, neuron_execution_free) + LOAD_FUNCTIONS( + libHandle_, NeuronExecution_setInput, neuron_execution_setInput) + LOAD_FUNCTIONS( + libHandle_, NeuronExecution_setOutput, neuron_execution_setOutput) + LOAD_FUNCTIONS(libHandle_, NeuronExecution_compute, neuron_execution_compute) + + NeuronExecution* run1 = NULL; + auto GetCurrentUS = []() -> double { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; + }; + + auto start_time = GetCurrentUS(); + int neuron_errCode = (*neuron_execution_create)(compilation_, &run1); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "[APU] Build APU runtime failed!"; + return subgraph::FAILED; + } + + // Set input buffer + Tensor input_temp; + for (size_t i = 0; i < origin_itensors_.size(); i++) { + input_temp.Resize({origin_idims_[i]}); + uint8_t* input_data = input_temp.mutable_data(); + memcpy(input_data, + origin_itensors_[i]->raw_data(), + origin_itensors_[i]->memory_size()); + for (int j = 0; j < origin_itensors_[i]->data_size(); j++) { + input_data[j] += (uint8_t)128; + } + (*neuron_execution_setInput)( + run1, i, NULL, input_data, origin_itensors_[i]->memory_size()); + } + + // Set output buffer + for (size_t i = 0; i < origin_otensors_.size(); i++) { + (*neuron_execution_setOutput)( + run1, + i, + NULL, + reinterpret_cast(origin_otensors_[i]->raw_data()), + origin_otensors_[i]->memory_size()); + } + + neuron_errCode = (*neuron_execution_compute)(run1); + if (NEURON_NO_ERROR != neuron_errCode) { + LOG(WARNING) << "Fail to run execution!" 
<< neuron_errCode; + return subgraph::FAILED; + } + + for (size_t i = 0; i < origin_otensors_.size(); i++) { + int8_t* output_data = origin_otensors_[i]->mutable_data(); + VLOG(3) << "output size:" << origin_otensors_[i]->memory_size(); + for (int j = 0; j < origin_otensors_[i]->data_size(); j++) { + output_data[j] -= (int8_t)128; + } + } + (*neuron_execution_free)(run1); + VLOG(3) << "[APU] Process cost " << GetCurrentUS() - start_time << " us"; + return 0; +} + +void SubgraphCompute::PrepareForRun() { + auto& param = this->Param(); + engine_.reset(new SubgraphEngine(ctx_.get(), + param.sub_block_idx, + param.sub_block_desc, + param.input_data_names, + param.output_data_names, + param.scope)); + CHECK(engine_); + engine_->Build(); +} + +void SubgraphCompute::Run() { + CHECK(engine_); + engine_->Launch(); +} + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle + +REGISTER_LITE_KERNEL(subgraph, + kAPU, + kInt8, + kNCHW, + paddle::lite::kernels::apu::SubgraphCompute, + def) + .BindInput("Inputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .BindOutput("Outputs", + {LiteType::GetTensorTy(TARGET(kHost), + PRECISION(kInt8), + DATALAYOUT(kNCHW))}) + .Finalize(); diff --git a/lite/kernels/apu/subgraph_compute.h b/lite/kernels/apu/subgraph_compute.h new file mode 100644 index 0000000000000000000000000000000000000000..cb8743e92914e1fb5752ae930da83ec9761c83a5 --- /dev/null +++ b/lite/kernels/apu/subgraph_compute.h @@ -0,0 +1,69 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "NeuronAdapter.h" +#include "lite/core/kernel.h" +#include "lite/kernels/npu/bridges/engine.h" +#include "lite/kernels/npu/bridges/registry.h" + +namespace paddle { +namespace lite { +namespace kernels { +namespace apu { + +class SubgraphEngine : public subgraph::Engine { + public: + SubgraphEngine(KernelContext *ctx, + int block_idx, + cpp::BlockDesc *block_desc, + const std::vector &input_names, + const std::vector &output_names, + Scope *scope) + : subgraph::Engine( + ctx, block_idx, block_desc, input_names, output_names, scope) {} + + protected: + int BuildDeviceProgram() override; + int LaunchDeviceProgram() override; + + std::string model_name_; + void *libHandle_; + NeuronModel *model_; + NeuronCompilation *compilation_; +}; + +class SubgraphCompute + : public KernelLite { + public: + using param_t = operators::SubgraphParam; + + void PrepareForRun() override; + + void Run() override; + + virtual ~SubgraphCompute() = default; + + private: + std::unique_ptr engine_; +}; + +} // namespace apu +} // namespace kernels +} // namespace lite +} // namespace paddle diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt index a3b1c3680e283a4425fe22209c443ce7cd958267..aa3a52e8ad1223451de06e820da7e1febb43b879 100644 --- a/lite/kernels/arm/CMakeLists.txt +++ b/lite/kernels/arm/CMakeLists.txt @@ -63,7 +63,6 @@ add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -92,7 +91,6 @@ add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_ add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm) -add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm) @@ -101,7 +99,6 @@ add_kernel(beam_search_compute_arm ARM extra SRCS beam_search_compute.cc DEPS ${ add_kernel(fill_constant_compute_arm ARM basic SRCS fill_constant_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(fill_constant_batch_size_like_compute_arm ARM basic SRCS fill_constant_batch_size_like_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lod_reset_compute_arm ARM extra SRCS lod_reset_compute.cc DEPS ${lite_kernel_deps} math_arm) 
-add_kernel(is_empty_compute_arm ARM extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(lstm_arm ARM extra SRCS lstm_compute.cc DEPS ${lite_kernel_deps} math_arm) # 4. training kernels diff --git a/lite/kernels/arm/activation_compute.cc b/lite/kernels/arm/activation_compute.cc index ea60cf528ea71f0bc0ba0a162063bd76899622f9..085e914c6e05c26d3031a4cfdac3c39d31f40f6d 100644 --- a/lite/kernels/arm/activation_compute.cc +++ b/lite/kernels/arm/activation_compute.cc @@ -207,6 +207,16 @@ void ReciprocalCompute::Run() { x_data, output_data, x_dims.production(), ctx.threads()); } +void AbsCompute::Run() { + auto& param = this->Param(); + auto& ctx = this->ctx_->template As(); + auto x_dims = param.X->dims(); + auto x_data = param.X->data(); + auto output_data = param.Out->mutable_data(); + lite::arm::math::act_abs( + x_data, output_data, x_dims.production(), ctx.threads()); +} + } // namespace arm } // namespace kernels } // namespace lite @@ -321,3 +331,8 @@ REGISTER_LITE_KERNEL(reciprocal, .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .Finalize(); +REGISTER_LITE_KERNEL( + abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def) + .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) + .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) + .Finalize(); diff --git a/lite/kernels/arm/activation_compute.h b/lite/kernels/arm/activation_compute.h index 2e8deda786a1ea9af70499c7b33c8aa1c6e19370..2e9774637b7a9156197ffeff5f4bca13a20620bb 100644 --- a/lite/kernels/arm/activation_compute.h +++ b/lite/kernels/arm/activation_compute.h @@ -166,6 +166,15 @@ class ReciprocalCompute : public KernelLite { virtual ~ReciprocalCompute() = default; }; +class AbsCompute : public KernelLite { + public: + using param_t = operators::ActivationParam; + + void Run() override; + + virtual ~AbsCompute() = default; +}; + } // namespace arm } // namespace kernels } // namespace lite diff --git a/lite/kernels/arm/compare_compute.cc b/lite/kernels/arm/compare_compute.cc deleted file mode 100644 index 709942a0d9f385e4ba55be32657633c0edc378cf..0000000000000000000000000000000000000000 --- a/lite/kernels/arm/compare_compute.cc +++ /dev/null @@ -1,295 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "lite/kernels/arm/compare_compute.h" -#include -#include "lite/api/paddle_place.h" -#include "lite/backends/arm/math/funcs.h" -#include "lite/core/op_registry.h" -#include "lite/core/type_system.h" - -namespace paddle { -namespace lite { -namespace kernels { -namespace arm { - -#define COMPARE_FUNCTOR(name, op) \ - template \ - struct _##name##Functor { \ - inline bool operator()(const T &a, const T &b) const { return a op b; } \ - }; - -COMPARE_FUNCTOR(Equal, ==); -COMPARE_FUNCTOR(NotEqual, !=); -COMPARE_FUNCTOR(LessThan, <); -COMPARE_FUNCTOR(LessEqual, <=); -COMPARE_FUNCTOR(GreaterThan, >); -COMPARE_FUNCTOR(GreaterEqual, >=); - -template <> -struct _EqualFunctor { - inline bool operator()(const float &a, const float &b) const { - // It is safe to cast a and b to double. - return fabs(static_cast(a - b)) < 1e-8; - } -}; - -template <> -struct _NotEqualFunctor { - inline bool operator()(const float &a, const float &b) const { - return !_EqualFunctor()(a, b); - } -}; - -inline void get_mid_dims(const lite::DDim &x_dims, - const lite::DDim &y_dims, - const int axis, - int *pre, - int *n, - int *post) { - *pre = 1; - *n = 1; - *post = 1; - for (int i = 0; i < axis; ++i) { - (*pre) *= x_dims[i]; - } - - for (int i = 0; i < y_dims.size(); ++i) { - (*n) *= y_dims[i]; - } - - for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) { - (*post) *= x_dims[i]; - } -} - -template