未验证 提交 88d7b311 编写于 作者: S Santa An 提交者: GitHub

Merge branch 'develop' into baolei/bitmain

...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " ...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}") message(STATUS "AR tools: ${CMAKE_AR}")
# Global build configuration applied only when targeting Windows.
if(WIN32)
  # User-visible switch selecting the static C runtime (/MT, /MTd).
  option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
  set(CMAKE_SUPPRESS_REGENERATION ON)
  set(CMAKE_STATIC_LIBRARY_PREFIX lib)
  # Define GOOGLE_GLOG_DLL_DECL as empty for every target.
  add_definitions("/DGOOGLE_GLOG_DLL_DECL=")

  # Append /bigobj plus the static-CRT switch to the C and C++ flags of the
  # Debug and Release configurations.
  if(MSVC_STATIC_CRT)
    foreach(_paddle_msvc_lang C CXX)
      set(CMAKE_${_paddle_msvc_lang}_FLAGS_DEBUG
          "${CMAKE_${_paddle_msvc_lang}_FLAGS_DEBUG} /bigobj /MTd")
      set(CMAKE_${_paddle_msvc_lang}_FLAGS_RELEASE
          "${CMAKE_${_paddle_msvc_lang}_FLAGS_RELEASE} /bigobj /MT")
    endforeach()
  endif()

  # Silenced compiler warnings, followed by /MP (parallel compilation).
  add_compile_options(
    /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838
    /MP)
  message(STATUS "Using parallel compiling (/MP)")

  # Linker warnings ignored for static libraries, shared libraries and
  # executables alike.
  set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
  foreach(_paddle_link_kind STATIC SHARED EXE)
    set(CMAKE_${_paddle_link_kind}_LINKER_FLAGS
        "${CMAKE_${_paddle_link_kind}_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
  endforeach()
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
find_package(CUDA QUIET) find_package(CUDA QUIET)
endif() endif()
...@@ -59,10 +84,12 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF) ...@@ -59,10 +84,12 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON) lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF) lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF) lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
...@@ -105,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING ...@@ -105,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
if(WIN32)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
else()
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE) FORCE)
endif()
endif() endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
...@@ -129,6 +163,10 @@ if (LITE_WITH_PYTHON) ...@@ -129,6 +163,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11 include(external/pybind11) # download, build, install pybind11
endif() endif()
# Pull in the RKNPU device module only when RKNPU support is requested.
if(LITE_WITH_RKNPU)
  include(device/rknpu)
endif()
# for mobile # for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
...@@ -136,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) ...@@ -136,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/postproject) include(cross_compiling/postproject)
include(device/npu) # check and prepare NPU DDK include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK include(device/xpu) # check and prepare XPU SDK
include(device/apu) # check and prepare APU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed. # So the following third party dependencies are not needed.
...@@ -185,6 +224,7 @@ endif() ...@@ -185,6 +224,7 @@ endif()
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm include(external/libxsmm) # download, build, install libxsmm
include(external/gflags) # download, build, install gflags include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog include(external/glog) # download, build, install glog
...@@ -209,7 +249,9 @@ include(generic) # simplify cmake module ...@@ -209,7 +249,9 @@ include(generic) # simplify cmake module
include(ccache) # set ccache for compilation include(ccache) # set ccache for compilation
include(util) # set unittest and link libs include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION include(version) # set PADDLE_VERSION
include(flags) if(NOT APPLE)
include(flags)
endif()
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
@echo off
setlocal
setlocal enabledelayedexpansion
rem Entry point of the Windows x86 build: asks for the Visual Studio command
rem prompt script, prepares the third-party sources, configures the project
rem with CMake and builds lite\publish_inference.vcxproj with msbuild.
set source_path=%~dp0
rem global variables
set BUILD_EXTRA=OFF
set BUILD_JAVA=ON
set BUILD_PYTHON=OFF
set BUILD_DIR=%source_path%
set OPTMODEL_DIR=""
set BUILD_TAILOR=OFF
set BUILD_CV=OFF
set SHUTDOWN_LOG=ON
set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
set workspace=%source_path%
rem Keep prompting until the given vcvarsall.bat path exists.
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "%vcvarsall_dir%" (
    echo "------------%vcvarsall_dir% not exist------------"
    goto set_vcvarsall_dir
)
call:prepare_thirdparty
rem The build paths must be assigned before the existence check below,
rem otherwise the check tests an empty path and a stale build tree is kept.
set root_dir=%workspace%
set build_directory=%BUILD_DIR%\build.lite.x86
set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code
set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug
rem Start from a clean build directory when one is left over from a previous run.
if EXIST "%build_directory%" (
    call:rm_rebuild_dir "%build_directory%"
    md "%build_directory%"
)
rem For code generation, a source file is produced after a test but is depended on by some cmake targets.
rem An empty placeholder file is created here so that cmake can configure.
if NOT EXIST "%GEN_CODE_PATH_PREFIX%" (
    md "%GEN_CODE_PATH_PREFIX%"
)
type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc"
if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" (
    md "%DEBUG_TOOL_PATH_PREFIX%"
)
copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\"
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^
-DWITH_MKLDNN=OFF ^
-DLITE_WITH_X86=ON ^
-DLITE_WITH_PROFILE=OFF ^
-DWITH_LITE=ON ^
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^
-DLITE_WITH_ARM=OFF ^
-DWITH_GPU=OFF ^
-DLITE_BUILD_EXTRA=ON ^
-DLITE_WITH_PYTHON=ON ^
-DPYTHON_EXECUTABLE="%python_path%"
rem Load the 64-bit Visual Studio environment, then build; all msbuild output goes to mylog.txt.
call "%vcvarsall_dir%" amd64
msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1
goto:eof
:prepare_thirdparty
rem Ask for the python interpreter, unpack (and if needed download) the
rem third-party archive into %workspace%, then update the git submodules.
rem An empty answer selects python.exe; a path that does not exist is rejected
rem and the question is asked again instead of continuing with a bad interpreter.
SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>"
set tmp_var=!python_path!
call:remove_space
set python_path=!tmp_var!
if "!python_path!"=="" (
    set python_path=python.exe
) else (
    if NOT exist "!python_path!" (
        echo "------------!python_path! not exist------------"
        rem Clear the rejected value so that an empty answer can still pick the default.
        set "python_path="
        goto prepare_thirdparty
    )
)
if EXIST "%workspace%\third-party" (
    if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
        echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists."
    ) else (
        echo "The directory of third_party exists, the third-party-05b862.tar.gz exists."
        call:rm_rebuild_dir "%workspace%\third-party"
        !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
    )
) else (
    if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
        echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
        call:download_third_party
        !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
    ) else (
        echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists."
        !python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
    )
)
git submodule update --init --recursive
goto:eof
:download_third_party
rem Download the third-party archive with PowerShell's System.Net.WebClient.
rem The destination is the file name appended directly to %workspace%
rem (no path separator is inserted here).
powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^
'%workspace%third-party-05b862.tar.gz')
goto:eof
:rm_rebuild_dir
rem Delete the directory passed as the first argument (%~1) together with
rem everything below it. Output and errors of both commands are discarded.
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
rem Strip leading and trailing spaces from the variable tmp_var in place.
rem Callers store the text in tmp_var, call this label and read tmp_var back.
:remove_left_space
if "%tmp_var:~0,1%"==" " (
    set "tmp_var=%tmp_var:~1%"
    goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
    set "tmp_var=%tmp_var:~0,-1%"
    rem Continue with the right-hand side only; the left side is already trimmed.
    goto remove_right_space
)
goto:eof
\ No newline at end of file
...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND) ...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG}) set(SIMD_FLAG ${SSE3_FLAG})
endif() endif()
if(WIN32)
  # windows header option for all targets.
  add_definitions(-D_XKEYCHECK_H)

  # Only MSVC is supported on Windows. The mode keyword has to be FATAL_ERROR:
  # "FATAL" is not a message mode, so it would be printed as part of the text
  # and configuration would carry on with an unsupported compiler.
  if(NOT MSVC)
    message(FATAL_ERROR "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
  endif()
endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
add_definitions(-DLITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_GPU)
...@@ -70,7 +79,7 @@ endif() ...@@ -70,7 +79,7 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB) if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32) if(WIN32 OR APPLE)
# openmp not support well for now on windows # openmp not support well for now on windows
set(OPENMP_FLAGS "") set(OPENMP_FLAGS "")
else(WIN32) else(WIN32)
...@@ -134,6 +143,14 @@ if (LITE_WITH_NPU) ...@@ -134,6 +143,14 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU") add_definitions("-DLITE_WITH_NPU")
endif() endif()
# Expose -DLITE_WITH_<DEVICE> to the compiler for each enabled device backend.
foreach(_lite_device_backend APU RKNPU)
  if(LITE_WITH_${_lite_device_backend})
    add_definitions("-DLITE_WITH_${_lite_device_backend}")
  endif()
endforeach()
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU") add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL) if (LITE_WITH_XTCL)
...@@ -181,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -181,3 +198,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL") add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL")
endif(LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
# Expose -DLITE_WITH_PYTHON to the compiler when the Python option is on.
if(LITE_WITH_PYTHON)
  add_definitions("-DLITE_WITH_PYTHON")
endif()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Locate the APU DDK (header and the two Neuron libraries) and publish them as
# imported targets. Nothing is done unless LITE_WITH_APU is enabled.
if(NOT LITE_WITH_APU)
  return()
endif()

# DDK root: an already defined APU_DDK_ROOT wins, otherwise the environment
# variable of the same name is used; configuration stops when that is empty.
if(NOT DEFINED APU_DDK_ROOT)
  set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
  if(NOT APU_DDK_ROOT)
    message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
  endif()
endif()
message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")

# The header is looked up only below the DDK root.
find_path(APU_DDK_INC
  NAMES NeuronAdapter.h
  PATHS ${APU_DDK_ROOT}/include
  NO_DEFAULT_PATH)
if(NOT APU_DDK_INC)
  message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")
include_directories("${APU_DDK_ROOT}/include")

# Library sub-directory inside the DDK; the default and the armv8 case both
# resolve to lib64.
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
  set(APU_SUB_LIB_PATH "lib64")
endif()

find_library(APU_NEURON_FILE
  NAMES neuron
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})
find_library(APU_NEURON_ADAPTER_FILE
  NAMES neuron_adapter
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH})

# message(FATAL_ERROR) stops processing, so the code after each check only
# runs when the library was found.
if(NOT APU_NEURON_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
endif()
message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
add_library(apu_neuron SHARED IMPORTED GLOBAL)
set_target_properties(apu_neuron PROPERTIES
  IMPORTED_LOCATION "${APU_NEURON_FILE}")

if(NOT APU_NEURON_ADAPTER_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
endif()
message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
set_target_properties(apu_neuron_adapter PROPERTIES
  IMPORTED_LOCATION "${APU_NEURON_ADAPTER_FILE}")

# Names of the imported targets, stored in the cache for other CMake files.
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "${apu_runtime_libs}")
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Locate the RKNPU DDK (header and rknpu_ddk library) and publish the library
# as an imported target. Nothing is done unless LITE_WITH_RKNPU is enabled.
if(NOT LITE_WITH_RKNPU)
  return()
endif()

# DDK root: an already defined RKNPU_DDK_ROOT wins, otherwise the environment
# variable of the same name is used; configuration stops when that is empty.
if(NOT DEFINED RKNPU_DDK_ROOT)
  set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
  if(NOT RKNPU_DDK_ROOT)
    message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
  endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")

# The header is looked up only below the DDK root.
find_path(RKNPU_DDK_INC
  NAMES rknpu/rknpu_pub.h
  PATHS ${RKNPU_DDK_ROOT}/include/
  NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
  message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")

# Library sub-directory inside the DDK: lib for armv7, lib64 otherwise.
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
  set(RKNPU_SUB_LIB_PATH "lib64")
elseif(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
  set(RKNPU_SUB_LIB_PATH "lib")
endif()

find_library(RKNPU_DDK_FILE
  NAMES rknpu_ddk
  PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})

# message(FATAL_ERROR) stops processing, so the code after the check only runs
# when the library was found.
if(NOT RKNPU_DDK_FILE)
  message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
endif()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_target_properties(rknpu_ddk PROPERTIES
  IMPORTED_LOCATION "${RKNPU_DDK_FILE}")

# Name of the imported target, stored in the cache for other CMake files.
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
...@@ -36,7 +36,16 @@ else() ...@@ -36,7 +36,16 @@ else()
# eigen on cuda9.1 missing header of math_funtions.hpp # eigen on cuda9.1 missing header of math_funtions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG GIT_TAG
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip ######################################################################################################
# url address of eigen before v2.3.0
# URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
######################################################################################################
# url address of eigen since v2.6.0
# github address: https://github.com/eigenteam/eigen-git-mirror
# we changed the source code to adapt for windows compiling
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR} DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1 DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR} PREFIX ${EIGEN_SOURCE_DIR}
......
...@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML}) ...@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML})
return() return()
ENDIF(NOT ${WITH_MKLML}) ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject) INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml") SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
...@@ -38,7 +32,17 @@ IF(WIN32) ...@@ -38,7 +32,17 @@ IF(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE)
#TODO(intel-huying):
# The Erf function is enabled in the mklml library temporarily; it will be updated to the official version later.
SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
ELSE() ELSE()
#TODO(intel-huying): #TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later. # Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
......
...@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES}) ...@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "") SET(py_env "")
IF(PYTHONINTERP_FOUND) IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED) find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED) #find_python_module(numpy REQUIRED)
#find_python_module(wheel REQUIRED) #find_python_module(wheel REQUIRED)
#find_python_module(google.protobuf REQUIRED) #find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED) #FIND_PACKAGE(NumPy REQUIRED)
#IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") #IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
# MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " # MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
# "please use pip to upgrade protobuf. pip install -U protobuf") # "please use pip to upgrade protobuf. pip install -U protobuf")
......
...@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME) ...@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} mklml) add_dependencies(${TARGET_NAME} mklml)
if(WIN32) if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB}) target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
else(WIN32) elseif(NOT APPLE)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed") target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif(WIN32) endif(WIN32)
endif() endif()
......
...@@ -22,7 +22,7 @@ endfunction() ...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET) function (lite_deps TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS}) set(deps ${lite_deps_DEPS})
...@@ -88,6 +88,18 @@ function (lite_deps TARGET) ...@@ -88,6 +88,18 @@ function (lite_deps TARGET)
endforeach(var) endforeach(var)
endif() endif()
if (LITE_WITH_APU)
foreach(var ${lite_deps_APU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS}) foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var}) set(deps ${deps} ${var})
...@@ -131,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean ...@@ -131,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET) function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module) set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -142,10 +154,12 @@ function(lite_cc_library TARGET) ...@@ -142,10 +154,12 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS} CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS} CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS} CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -161,8 +175,10 @@ function(lite_cc_library TARGET) ...@@ -161,8 +175,10 @@ function(lite_cc_library TARGET)
else() else()
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
# collect targets need to compile for lite # collect targets need to compile for lite
if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET}) add_dependencies(lite_compile_deps ${TARGET})
...@@ -177,7 +193,7 @@ function(lite_cc_binary TARGET) ...@@ -177,7 +193,7 @@ function(lite_cc_binary TARGET)
set(options " -g ") set(options " -g ")
endif() endif()
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -190,8 +206,10 @@ function(lite_cc_binary TARGET) ...@@ -190,8 +206,10 @@ function(lite_cc_binary TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
...@@ -199,7 +217,9 @@ function(lite_cc_binary TARGET) ...@@ -199,7 +217,9 @@ function(lite_cc_binary TARGET)
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
) )
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
if (NOT APPLE) if (NOT APPLE)
# strip binary target to reduce size # strip binary target to reduce size
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
...@@ -226,7 +246,7 @@ function(lite_cc_test TARGET) ...@@ -226,7 +246,7 @@ function(lite_cc_test TARGET)
endif() endif()
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS ARGS
COMPILE_LEVEL # (basic|extra) COMPILE_LEVEL # (basic|extra)
...@@ -247,8 +267,10 @@ function(lite_cc_test TARGET) ...@@ -247,8 +267,10 @@ function(lite_cc_test TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS} HVY_DEPS ${args_HVY_DEPS}
...@@ -263,7 +285,9 @@ function(lite_cc_test TARGET) ...@@ -263,7 +285,9 @@ function(lite_cc_test TARGET)
"${TARGET}" "${TARGET}"
COMMENT "Strip debug symbols done on final executable file.") COMMENT "Strip debug symbols done on final executable file.")
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
file(APPEND ${offline_test_registry_file} "${TARGET}\n") file(APPEND ${offline_test_registry_file} "${TARGET}\n")
# collect targets need to compile for lite # collect targets need to compile for lite
...@@ -277,9 +301,11 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") ...@@ -277,9 +301,11 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels") set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels") set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels") set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels") set(host_kernels CACHE INTERNAL "host kernels")
...@@ -295,12 +321,12 @@ if(LITE_BUILD_TAILOR) ...@@ -295,12 +321,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif() endif()
# add a kernel for some specific device # add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM) # device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra) # level: one of (basic, extra)
function(add_kernel TARGET device level) function(add_kernel TARGET device level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -323,6 +349,12 @@ function(add_kernel TARGET device level) ...@@ -323,6 +349,12 @@ function(add_kernel TARGET device level)
if ("${device}" STREQUAL "Host") if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "") set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "ARM") if ("${device}" STREQUAL "ARM")
...@@ -352,6 +384,15 @@ function(add_kernel TARGET device level) ...@@ -352,6 +384,15 @@ function(add_kernel TARGET device level)
endif() endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "APU")
if (NOT LITE_WITH_APU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -379,8 +420,20 @@ function(add_kernel TARGET device level) ...@@ -379,8 +420,20 @@ function(add_kernel TARGET device level)
endif() endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "") set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU") if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU) if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return() return()
endif() endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "") set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
...@@ -423,8 +476,10 @@ function(add_kernel TARGET device level) ...@@ -423,8 +476,10 @@ function(add_kernel TARGET device level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -444,7 +499,7 @@ endif() ...@@ -444,7 +499,7 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -477,8 +532,10 @@ function(add_operator TARGET level) ...@@ -477,8 +532,10 @@ function(add_operator TARGET level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -486,6 +543,29 @@ function(add_operator TARGET level) ...@@ -486,6 +543,29 @@ function(add_operator TARGET level)
) )
endfunction() endfunction()
# Only used on Windows.
#
# create_static_lib(<TARGET_NAME> <lib>...)
#
# Merges the given static libraries into the single target <TARGET_NAME>.
# The de-duplicated inputs are first merged, chunk by chunk, into intermediate
# targets named <TARGET_NAME>_dummy_<N> (N = 1, 2, ...); those intermediate
# targets are then merged into <TARGET_NAME>, so no single merge_static_libs()
# call receives the complete input list.
# NOTE(review): the chunking is presumably there to keep each merge command
# short enough for the Windows toolchain - confirm against merge_static_libs.
function(create_static_lib TARGET_NAME)
  set(libs ${ARGN})
  list(LENGTH libs libs_len)
  # `set(libs ${ARGN})` leaves `libs` undefined when no library is passed, so
  # only de-duplicate when there is actually something to de-duplicate.
  if(${libs_len} GREATER 1)
    list(REMOVE_DUPLICATES libs)
    list(LENGTH libs libs_len)
  endif()

  # A function sees the variables of its caller. Clear the accumulators
  # explicitly so that stale values with the same names cannot end up in the
  # merged library.
  set(dummy_list)
  set(${TARGET_NAME}_dummy_list)

  set(dummy_index 1)   # suffix of the next intermediate target
  set(dummy_offset 1)  # 1-based position of the library being visited
  # A chunk is closed as soon as it holds more than this many libraries.
  set(dummy_limit 60)
  foreach(lib ${libs})
    list(APPEND dummy_list ${lib})
    list(LENGTH dummy_list listlen)
    # Flush when the chunk is full or when the last library has been appended.
    if((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len}))
      merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list})
      set(dummy_list)
      list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index})
      math(EXPR dummy_index "${dummy_index}+1")
    endif()
    math(EXPR dummy_offset "${dummy_offset}+1")
  endforeach()
  # Final step: merge the intermediate targets into the requested target.
  merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list})
endfunction()
# Bundle several static libraries into one. # Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target) function(bundle_static_library tgt_name bundled_tgt_name fake_target)
...@@ -529,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) ...@@ -529,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
set(bundled_tgt_full_name set(bundled_tgt_full_name
${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
#message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
# Windows path: the bundle is produced through create_static_lib() and the
# function returns before the statements that follow this branch.
if(WIN32)
# Name of the IMPORTED target that represents the bundled library file.
set(dummy_tgt_name dummy_${bundled_tgt_name})
# Merge every entry of static_libs into the target ${bundled_tgt_name}.
create_static_lib(${bundled_tgt_name} ${static_libs})
# ${fake_target} is part of the default build (ALL) and is ordered after both
# the bundled target and the original ${tgt_name}.
add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
add_dependencies(${fake_target} ${tgt_name})
# Imported wrapper around the bundled file; it reuses the interface include
# directories of ${tgt_name}.
# NOTE(review): IMPORTED_LOCATION points at ${bundled_tgt_full_name}, i.e.
# directly inside CMAKE_BINARY_DIR - confirm merge_static_libs writes the
# library to that location on Windows.
add_library(${dummy_tgt_name} STATIC IMPORTED)
set_target_properties(${dummy_tgt_name}
PROPERTIES
IMPORTED_LOCATION ${bundled_tgt_full_name}
INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
add_dependencies(${dummy_tgt_name} ${fake_target})
# Leave bundle_static_library here; the rest of the function is skipped.
return()
endif()
if(NOT IOS) if(NOT IOS)
file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
......
...@@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}") ...@@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}") message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
...@@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -70,12 +72,18 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU) endif(LITE_WITH_XPU)
if (LITE_WITH_APU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu")
endif(LITE_WITH_APU)
if (LITE_WITH_FPGA) if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA) endif(LITE_WITH_FPGA)
if (LITE_WITH_BM) if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM) endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else() else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib") set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif() endif()
...@@ -83,16 +91,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") ...@@ -83,16 +91,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib # add python lib
if (LITE_WITH_PYTHON) if (LITE_WITH_PYTHON)
add_custom_target(publish_inference_python_lib ${TARGET} if(WIN32)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd")
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs" set(LITE_CORE_DEPS ${LITE_CORE})
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" add_custom_command(OUTPUT ${LITE_CORE}
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/" COMMAND cmake -E copy $<TARGET_FILE:lite_pybind> ${LITE_CORE}
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite" DEPENDS lite_pybind)
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so" add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS})
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd"
DEPENDS copy_lite_pybind
)
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
else()
if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
else()
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
endif()
add_custom_target(publish_inference_python_installer ${TARGET} add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND python setup.py bdist_wheel COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/ WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib) DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET} add_custom_target(publish_inference_python_light_demo ${TARGET}
...@@ -108,30 +159,78 @@ if (LITE_WITH_PYTHON) ...@@ -108,30 +159,78 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo) add_dependencies(publish_inference publish_inference_python_light_demo)
endif(WIN32)
endif() endif()
if (LITE_WITH_CUDA OR LITE_WITH_X86) if (LITE_WITH_CUDA OR LITE_WITH_X86)
add_custom_target(publish_inference_cxx_lib ${TARGET} if(APPLE)
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" )
) add_custom_target(publish_inference_third_party ${TARGET}
add_custom_target(publish_inference_third_party ${TARGET} COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party" COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party") add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib bundle_full_api) add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference_cxx_lib bundle_light_api) add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared) add_dependencies(publish_inference publish_inference_third_party)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) elseif(NOT WIN32)
add_dependencies(publish_inference publish_inference_cxx_lib) add_custom_target(publish_inference_cxx_lib ${TARGET}
add_dependencies(publish_inference publish_inference_third_party) COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif() endif()
if (LITE_WITH_X86) if (LITE_WITH_X86)
if(WIN32)
# Windows publish step for the x86 C++ library: create the cxx/lib, bin and
# cxx/include directories under ${INFER_LITE_PUBLISH_ROOT}, then copy the
# public API headers and the two bundled static libraries into them. Only
# `cmake -E` commands are used, so no Unix shell tools are required.
# NOTE(review): the .lib paths contain ${CMAKE_BUILD_TYPE}, which is evaluated
# at configure time and is empty unless the caller sets it - confirm the
# Windows build scripts always pass it.
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
# The copy commands above need both bundles to exist, so order this target
# after the two bundling targets, and hook it into publish_inference.
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
# Stage the third-party install tree and eigen3 next to the library and create
# the (empty) demo/cxx directory.
# NOTE(review): `cmake -E copy_directory` copies the *contents* of the source
# directory, so the eigen3 files land directly in third_party/ rather than in
# third_party/eigen3 - confirm this layout is intended.
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
# On Windows the demos target is reached through publish_inference_x86_cxx_lib,
# and is itself ordered after the bundled full API and eigen3 targets.
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
else()
add_custom_target(publish_inference_x86_cxx_lib ${TARGET} add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -146,6 +245,7 @@ if (LITE_WITH_X86) ...@@ -146,6 +245,7 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos) add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
......
...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml) add_dependencies(paddle_full_api_shared dynload_mklml)
endif() endif()
if(WIN32)
target_link_libraries(paddle_full_api_shared shlwapi.lib)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
...@@ -34,15 +37,20 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -34,15 +37,20 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
) )
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") if(NOT APPLE AND NOT WIN32)
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE}) add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS}) add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
add_dependencies(paddle_full_api_shared custom_linker_map) set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
endif()
else() else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux")) if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "") add_library(paddle_light_api_shared SHARED "")
...@@ -57,6 +65,11 @@ else() ...@@ -57,6 +65,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency # Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs}) target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif() endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif() endif()
endif() endif()
...@@ -67,8 +80,11 @@ if (WITH_TESTING) ...@@ -67,8 +80,11 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels} CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}) MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels})
endif() endif()
if(LITE_WITH_FPGA) if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps}) set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -80,6 +96,12 @@ if(LITE_WITH_BM) ...@@ -80,6 +96,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps}) set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif() endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}") message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}") message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}") message(STATUS "get CUDA kernels ${cuda_kernels}")
...@@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}") ...@@ -87,7 +109,9 @@ message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get APU kernels ${apu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}") message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}") message(STATUS "get MLU kernels ${mlu_kernels}")
...@@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -105,6 +129,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}) FPGA_DEPS ${fpga_kernels})
...@@ -125,7 +151,9 @@ lite_cc_library(light_api SRCS light_api.cc ...@@ -125,7 +151,9 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
...@@ -144,7 +172,9 @@ if(WITH_TESTING) ...@@ -144,7 +172,9 @@ if(WITH_TESTING)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
...@@ -200,7 +230,7 @@ if(WITH_TESTING) ...@@ -200,7 +230,7 @@ if(WITH_TESTING)
endif() endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
DEPS ${lite_model_test_DEPS} DEPS ${lite_model_test_DEPS}
...@@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) ...@@ -246,6 +276,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL) --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz) add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model # brief: we comment ocr_test_ut because we do not supply ocr model to test, it is the reference to infer nlp model
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc # lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS}) # DEPS ${lite_model_test_DEPS})
...@@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -271,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}) BM_DEPS ${bm_kernels})
...@@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc ...@@ -289,6 +321,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
...@@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc ...@@ -298,6 +331,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
...@@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ...@@ -333,6 +367,8 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
...@@ -352,8 +388,10 @@ if(NOT IOS) ...@@ -352,8 +388,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -365,8 +403,10 @@ if(NOT IOS) ...@@ -365,8 +403,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -378,8 +418,10 @@ if(NOT IOS) ...@@ -378,8 +418,10 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
...@@ -390,7 +432,9 @@ if(NOT IOS) ...@@ -390,7 +432,9 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -401,19 +445,24 @@ if(NOT IOS) ...@@ -401,19 +445,24 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels}) CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${ops} ${host_kernels}
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
......
...@@ -13,7 +13,13 @@ ...@@ -13,7 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include <time.h> #include <time.h>
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
...@@ -27,6 +33,9 @@ ...@@ -27,6 +33,9 @@
#include "lite/utils/cp_logging.h" #include "lite/utils/cp_logging.h"
#include "lite/utils/string.h" #include "lite/utils/string.h"
DEFINE_string(optimized_model_path,
"",
"the path of the model that is optimized by opt.");
DEFINE_string(model_dir, DEFINE_string(model_dir,
"", "",
"the path of the model, the model and param files is under " "the path of the model, the model and param files is under "
...@@ -61,10 +70,7 @@ DEFINE_int32(threads, 1, "threads num"); ...@@ -61,10 +70,7 @@ DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename, DEFINE_string(result_filename,
"result.txt", "result.txt",
"save the inference time to the file."); "save the inference time to the file.");
DEFINE_bool(run_model_optimize, DEFINE_bool(show_output, false, "Wether to show the output in shell.");
false,
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
namespace paddle { namespace paddle {
namespace lite_api { namespace lite_api {
...@@ -100,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) { ...@@ -100,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir; LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
} }
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<int64_t>& input_shape, void Run(const std::vector<int64_t>& input_shape,
const std::string& model_dir, const std::string& model_path,
const std::string model_name) { const std::string model_name) {
// set config and create predictor // set config and create predictor
lite_api::MobileConfig config; lite_api::MobileConfig config;
config.set_threads(FLAGS_threads); config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode)); config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_from_file(model_dir + ".nb"); config.set_model_from_file(model_path);
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
...@@ -116,10 +130,7 @@ void Run(const std::vector<int64_t>& input_shape, ...@@ -116,10 +130,7 @@ void Run(const std::vector<int64_t>& input_shape,
auto input_tensor = predictor->GetInput(0); auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto input_data = input_tensor->mutable_data<float>(); auto input_data = input_tensor->mutable_data<float>();
int input_num = 1; int64_t input_num = ShapeProduction(input_shape);
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
if (FLAGS_input_img_path.empty()) { if (FLAGS_input_img_path.empty()) {
for (int i = 0; i < input_num; ++i) { for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f; input_data[i] = 1.f;
...@@ -167,26 +178,78 @@ void Run(const std::vector<int64_t>& input_shape, ...@@ -167,26 +178,78 @@ void Run(const std::vector<int64_t>& input_shape,
ofs << "average = " << std::setw(12) << avg_res; ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl; ofs << std::endl;
ofs.close(); ofs.close();
if (FLAGS_show_output) {
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
LOG(INFO) << "output data[0:10]:";
for (int i = 0; i < 10; i++) {
LOG(INFO) << out_data[i];
}
}
} }
#endif #endif
} // namespace lite_api } // namespace lite_api
} // namespace paddle } // namespace paddle
void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (The path of the model that is optimized\n"
" by opt. If the model is optimized, please set the param.) \n"
" type: string \n"
" --model_dir (The path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (The filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (The filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (Set input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (The path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (Arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (Repeats times) type: int32 default: 1 \n"
" --result_filename (Save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (Threads num) type: int32 default: 1 \n"
" --warmup (Warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path. Otherwise, \n"
" set model_dir, model_filename and param_filename according to \n"
" the model. \n";
LOG(INFO) << help_info;
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
// Check inputs
gflags::ParseCommandLineFlags(&argc, &argv, true); gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "") { bool is_opt_model = (FLAGS_optimized_model_path != "");
LOG(INFO) << "Please run ./benchmark_bin --help to obtain usage."; bool is_origin_model = (FLAGS_model_dir != "");
if (!is_origin_model && !is_opt_model) {
LOG(INFO) << "Input error, the model path should not be empty.\n";
print_usage();
exit(0); exit(0);
} }
if (FLAGS_model_dir.back() == '/') { // Get input shape
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
std::string model_name = FLAGS_model_dir.substr(found + 1);
std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> { auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape; std::vector<int64_t> shape;
std::string tmp_str = str_shape; std::string tmp_str = str_shape;
...@@ -202,19 +265,31 @@ int main(int argc, char** argv) { ...@@ -202,19 +265,31 @@ int main(int argc, char** argv) {
} }
return shape; return shape;
}; };
std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape); std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
// Output optimized model if needed // Get model_name and run_model_path
if (FLAGS_run_model_optimize) { std::string model_name;
paddle::lite_api::OutputOptModel(save_optimized_model_dir); std::string run_model_path;
if (is_origin_model) {
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
model_name = FLAGS_model_dir.substr(found + 1);
std::string optimized_model_path = FLAGS_model_dir + "_opt2";
paddle::lite_api::OutputOptModel(optimized_model_path);
run_model_path = optimized_model_path + ".nb";
} else {
size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
size_t len = found2 - found1 - 1;
model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
run_model_path = FLAGS_optimized_model_path;
} }
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK #ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model // Run test
std::string run_model_dir = paddle::lite_api::Run(input_shape, run_model_path, model_name);
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shape, run_model_dir, model_name);
#endif #endif
return 0; return 0;
} }
...@@ -292,9 +292,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc, ...@@ -292,9 +292,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
program_desc_ = desc; program_desc_ = desc;
// `inner_places` is used to optimize passes // `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places; std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)); for (auto &valid_place : valid_places) {
inner_places.emplace_back( inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)); Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
// Analysis whether the modle is quantized. // Analysis whether the modle is quantized.
// For quantized model, add place(arm, int8) to inner_places // For quantized model, add place(arm, int8) to inner_places
......
...@@ -20,24 +20,32 @@ ...@@ -20,24 +20,32 @@
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/version.h" #include "lite/core/version.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h> #include <omp.h>
#include "lite/backends/x86/mklml.h" #include "lite/backends/x86/mklml.h"
#endif #endif
namespace paddle { namespace paddle {
namespace lite { namespace lite {
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config; config_ = config;
auto places = config.valid_places(); auto places = config.valid_places();
std::vector<std::string> passes{};
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
// if kCUDA is included in valid places, it should be initialized first, // if kCUDA is included in valid places, it should be initialized first,
// otherwise skip this step. // otherwise skip this step.
for (auto &p : places) { for (auto &p : places) {
if (p.target == TARGET(kCUDA)) { if (p.target == TARGET(kCUDA)) {
Env<TARGET(kCUDA)>::Init(); Env<TARGET(kCUDA)>::Init();
if (config_.multi_stream()) {
passes = {"multi_stream_analysis_pass"};
VLOG(3) << "add pass: " << passes[0];
}
break; break;
} }
} }
...@@ -51,7 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -51,7 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.mlu_first_conv_std(), config.mlu_first_conv_std(),
config.mlu_input_layout()); config.mlu_input_layout());
#endif // LITE_WITH_MLU #endif // LITE_WITH_MLU
std::vector<std::string> passes{};
auto use_layout_preprocess_pass = auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS"); config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass; VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
...@@ -63,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) { ...@@ -63,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
raw_predictor_.Build(config, places, passes); raw_predictor_.Build(config, places, passes);
mode_ = config.power_mode(); mode_ = config.power_mode();
threads_ = config.threads(); threads_ = config.threads();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads(); int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1; int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads); paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
......
...@@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file, ...@@ -29,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_); LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
} }
// For weight quantization of post training, load the int8/16 weights
// for optimized model, and dequant it to fp32.
DequantizeWeight(); DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_); BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch(); PrepareFeedFetch();
} }
...@@ -79,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { ...@@ -79,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) {
if (element == input_names_.end()) { if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:"; << "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) { for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]"; LOG(ERROR) << "[" << input_names_[i] << "]";
} }
return nullptr; return nullptr;
...@@ -111,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -111,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds; std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs; std::vector<cpp::OpDesc*> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) { for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
...@@ -121,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -121,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() {
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
output_names_.resize(fetchs.size()); output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) { for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] = input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front(); feeds[i]->Output("Out").front();
} }
for (int i = 0; i < fetchs.size(); i++) { for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] = output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front(); fetchs[i]->Input("X").front();
} }
...@@ -138,9 +141,6 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { ...@@ -138,9 +141,6 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
// 2. Create Instructs // 2. Create Instructs
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
using OpenCLContext = Context<TargetType::kOpenCL>; using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext()); std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce(); local_ctx->As<OpenCLContext>().InitOnce();
...@@ -182,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) { ...@@ -182,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
} }
void LightPredictor::DequantizeWeight() { void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \ #define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \ for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < w; ++j) { \ for (int64_t j = 0; j < offset; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \ fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \
} \ } \
} }
#define PROCESS_FC_DATA() \ #define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \ for (int64_t i = 0; i < chin; i++) { \
*fp_data = scale_list[0] * (*int_data); \ for (int64_t j = 0; j < chout; j++) { \
++fp_data; \ fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \
++int_data; \ } \
} }
auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
bool result = false;
if (op_desc->HasAttr("quantization_type")) {
std::string type = op_desc->GetAttr<std::string>("quantization_type");
result = (type == "post_weight_abs_max") ||
(type == "post_weight_channel_wise_abs_max");
} else {
result = op_desc->HasAttr("quantize_weight_bits");
}
return result;
};
Tensor tmp_tensor; Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize()); for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < main_block->OpsSize(); ++k) { for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k); auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op if (is_weight_quantized_op(op_desc)) {
auto input_names = op_desc->input_vars(); auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) { for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale"; std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor = auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>(); scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor); tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list = auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name); op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits"); int quantize_weight_bits =
float* fp_data = input_tensor->mutable_data<float>(); op_desc->GetAttr<int>("quantize_weight_bits");
CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
std::string op_type = op_desc->Type(); float* fp_data = input_tensor->mutable_data<float>();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0]; std::string op_type = op_desc->Type();
int64_t w = input_tensor->numel() / h; if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
CHECK_EQ(scale_list.size(), h); int64_t ch = input_tensor->dims()[0];
if (quantize_weight_bits == 8) { int64_t offset = input_tensor->numel() / ch;
const int8_t* int_data = tmp_tensor.data<int8_t>(); CHECK_EQ(scale_list.size(), ch);
PROCESS_CONV2D_DATA() if (quantize_weight_bits == 8) {
} else { const int8_t* int_data = tmp_tensor.data<int8_t>();
const int16_t* int_data = tmp_tensor.data<int16_t>(); PROCESS_CONV2D_DATA()
PROCESS_CONV2D_DATA() } else {
} const int16_t* int_data = tmp_tensor.data<int16_t>();
} else if (op_type == "fc" || op_type == "mul") { PROCESS_CONV2D_DATA()
if (quantize_weight_bits == 8) { }
const int8_t* int_data = tmp_tensor.data<int8_t>(); } else if (op_type == "fc" || op_type == "mul") {
PROCESS_FC_DATA() int64_t chin = input_tensor->dims()[0];
} else { int64_t chout = input_tensor->dims()[1];
const int16_t* int_data = tmp_tensor.data<int16_t>(); CHECK_EQ(scale_list.size(), chout);
PROCESS_FC_DATA() if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
} }
} }
} }
......
...@@ -37,11 +37,11 @@ TEST(LightAPI, load) { ...@@ -37,11 +37,11 @@ TEST(LightAPI, load) {
const std::vector<std::string> inputs = predictor.GetInputNames(); const std::vector<std::string> inputs = predictor.GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
const std::vector<std::string> outputs = predictor.GetOutputNames(); const std::vector<std::string> outputs = predictor.GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
......
...@@ -293,13 +293,13 @@ int main(int argc, char** argv) { ...@@ -293,13 +293,13 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
std::vector<std::string> str_input_shapes_0 = std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0); split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0; std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) { for (size_t i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
} }
......
...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
#ifdef LITE_WITH_X86
config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kInt64)},
Place{TARGET(kHost), PRECISION(kFloat)}});
#else
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
#endif
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -198,7 +204,7 @@ int main(int argc, char** argv) { ...@@ -198,7 +204,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -310,7 +310,7 @@ int main(int argc, char** argv) { ...@@ -310,7 +310,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -114,7 +114,7 @@ void detect_object(const float* dout, ...@@ -114,7 +114,7 @@ void detect_object(const float* dout,
} }
std::string name = FLAGS_out_txt + "_accu.txt"; std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w"); FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) { for (size_t i = 0; i < objects.size(); ++i) {
Object object = objects.at(i); Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 && if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) { object.width > 0 && object.height > 0) {
...@@ -324,7 +324,7 @@ int main(int argc, char** argv) { ...@@ -324,7 +324,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -104,13 +104,21 @@ std::vector<Place> ParserValidPlaces() { ...@@ -104,13 +104,21 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") { } else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86)); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") { } else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU)); valid_places.emplace_back(TARGET(kMLU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -187,6 +195,8 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -187,6 +195,8 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -251,16 +261,16 @@ void PrintHelpInfo() { ...@@ -251,16 +261,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places_.emplace_back(TARGET(kRKNPU));
valid_places_.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places_.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { ...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() {
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n" " `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for " " ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n" "tailoring lib, false by default`\n"
...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the lengh of the first column: maximum length of the op_type // Get the lengh of the first column: maximum length of the op_type
......
...@@ -136,6 +136,9 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -136,6 +136,9 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86 #ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1; int x86_math_library_math_threads_ = 1;
#endif #endif
#ifdef LITE_WITH_CUDA
bool multi_stream_{false};
#endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1}; int mlu_core_number_{1};
...@@ -171,6 +174,10 @@ class LITE_API CxxConfig : public ConfigBase { ...@@ -171,6 +174,10 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_; return x86_math_library_math_threads_;
} }
#endif #endif
#ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
int multi_stream() const { return multi_stream_; }
#endif
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels // set MLU core version, which is used when compiling MLU kernels
......
...@@ -36,11 +36,11 @@ TEST(CxxApi, run) { ...@@ -36,11 +36,11 @@ TEST(CxxApi, run) {
auto inputs = predictor->GetInputNames(); auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
auto outputs = predictor->GetOutputNames(); auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
auto input_tensor = predictor->GetInputByName(inputs[0]); auto input_tensor = predictor->GetInputByName(inputs[0]);
......
...@@ -18,20 +18,21 @@ ...@@ -18,20 +18,21 @@
*/ */
#pragma once #pragma once
#define USE_LITE_OP(op_type__) \ // some platform-independent defintion
extern int touch_op_##op_type__(); \ #include "lite/utils/macros.h"
int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
touch_op_##op_type__(); #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
__attribute__((unused)) = \ UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
touch_##op_type__##target__##precision__##layout__##alias__();
#define USE_MIR_PASS(name__) \ #define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \ extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ __attribute__((unused)) = \ static bool mir_pass_usage##name__ UNUSED = \
mir_pass_registry##name__##_fake(); mir_pass_registry##name__##_fake();
#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
...@@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) { ...@@ -72,7 +72,9 @@ const std::string& TargetToStr(TargetType target) {
"npu", "npu",
"xpu", "xpu",
"bm", "bm",
"mlu"}; "mlu",
"rknpu",
"apu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) { ...@@ -112,8 +114,10 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kBM",
"kMLU", "kMLU",
"kBM"}; "kRKNPU",
"kAPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -156,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -156,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kXPU), TARGET(kXPU),
TARGET(kBM), TARGET(kBM),
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU),
TARGET(kFPGA)}); TARGET(kFPGA)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
......
...@@ -49,13 +49,15 @@ enum class TargetType : int { ...@@ -49,13 +49,15 @@ enum class TargetType : int {
kCUDA = 3, kCUDA = 3,
kARM = 4, kARM = 4,
kOpenCL = 5, kOpenCL = 5,
kAny = 6, // any target
kFPGA = 7, kFPGA = 7,
kNPU = 8, kNPU = 8,
kXPU = 9, kXPU = 9,
kBM = 10, kBM = 10,
kMLU = 11, kMLU = 11,
kAny = 6, // any target kRKNPU = 12,
NUM = 12, // number of fields. kAPU = 13,
NUM = 14, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
......
...@@ -42,12 +42,14 @@ USE_MIR_PASS(type_precision_cast_pass); ...@@ -42,12 +42,14 @@ USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass); USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass); USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass); USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass); USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass); USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
...@@ -17,8 +17,12 @@ execute_process( ...@@ -17,8 +17,12 @@ execute_process(
OUTPUT_VARIABLE PADDLE_LITE_COMMIT OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_STRIP_TRAILING_WHITESPACE
) )
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in if(APPLE)
${CMAKE_CURRENT_BINARY_DIR}/setup.py) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
add_subdirectory(pybind) add_subdirectory(pybind)
#add_subdirectory(interface) #add_subdirectory(interface)
...@@ -11,3 +11,12 @@ ...@@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import sys
if os.name == 'nt':
    # On Windows the third-party DLLs are shipped in the 'libs' directory
    # next to this file; make that directory visible to the loader.
    current_path = os.path.abspath(os.path.dirname(__file__))
    third_lib_path = current_path + os.sep + 'libs'
    # Older interpreters resolve dependent DLLs through PATH.
    os.environ['path'] = third_lib_path + ';' + os.environ['path']
    # Python 3.8+ no longer consults PATH when resolving the DLL dependencies
    # of extension modules; the directory has to be registered explicitly.
    # add_dll_directory() rejects non-existent paths, hence the isdir() guard.
    if hasattr(os, 'add_dll_directory') and os.path.isdir(third_lib_path):
        os.add_dll_directory(third_lib_path)
    sys.path.insert(0, third_lib_path)
...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif() endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH) if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif() endif()
...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { ...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) {
.value("FPGA", TargetType::kFPGA) .value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU) .value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -34,20 +34,27 @@ else: ...@@ -34,20 +34,27 @@ else:
# core lib of paddlelite is stored as lite.so # core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']} PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs # put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = [] PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] if os.name != 'nt':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
else:
PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll']
shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ if os.name != 'nt':
/inference_lite_lib/python/install/lite/lite.so" COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
if os.system(COMMAND) != 0: /inference_lite_lib/python/install/lite/lite.so"
raise Exception("patch third_party libs failed, command: %s" % COMMAND) if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'): if os.path.isfile(LIB_PATH+'/__init__.py'):
...@@ -61,6 +68,14 @@ PACKAGE_DIR = { ...@@ -61,6 +68,14 @@ PACKAGE_DIR = {
'paddlelite': LITE_PATH 'paddlelite': LITE_PATH
} }
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in PACKAGE_DIR.items():
fix_package_dir[k] = v.replace('/', '\\')
PACKAGE_DIR = fix_package_dir
setup( setup(
name='paddlelite', name='paddlelite',
version=PADDLELITE_VERSION, version=PADDLELITE_VERSION,
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Script that packs the whl (wheel) installer for Paddle-Lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
    """Binary distribution: always reports that extension modules exist."""

    def has_ext_modules(self):
        """Return True unconditionally, marking the package as binary."""
        return True
# Package version: the release tag when one is set, otherwise the commit id.
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
PADDLELITE_VERSION = PADDLELITE_TAG if PADDLELITE_TAG != "" else PADDLELITE_COMMITE
# lite.so, the core library, lives in the 'paddlelite' package directory
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
# third-party shared libraries are gathered in the 'paddlelite.libs' directory
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'

PACKAGE_DATA = {
    'paddlelite': ['lite.so'],
    'paddlelite.libs': [],
}

if '${WITH_MKL}' == 'ON':
    # ship the MKLML runtime libraries together with the package
    for mkl_lib in ('${MKLML_SHARED_IOMP_LIB}', '${MKLML_SHARED_LIB}'):
        shutil.copy(mkl_lib, LIB_PATH)
    PACKAGE_DATA['paddlelite.libs'].extend(['libmklml.dylib', 'libiomp5.dylib'])

# patch lite.so so that it refers to the sibling libs directory
# NOTE(review): '-id' rewrites the install name of lite.so itself; confirm
# whether an rpath entry (install_name_tool -add_rpath) is what is needed here.
COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
    raise Exception("patch third_party libs failed, command: %s" % COMMAND)

# drop the unused __init__.py from the libs directory when it exists
stale_init = LIB_PATH + '/__init__.py'
if os.path.isfile(stale_init):
    os.remove(stale_init)
# Directory that provides the contents of each package.
PACKAGE_DIR = {
    'paddlelite': LITE_PATH,
    'paddlelite.libs': LIB_PATH,
}

# Build the 'paddlelite' distribution as a binary distribution.
setup(
    name='paddlelite',
    version=PADDLELITE_VERSION,
    description='Paddle-Lite Library',
    distclass=BinaryDistribution,
    packages=['paddlelite', 'paddlelite.libs'],
    package_dir=PACKAGE_DIR,
    package_data=PACKAGE_DATA,
)
...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) {
for (size_t i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5); EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5);
} }
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
} }
......
...@@ -15,7 +15,12 @@ ...@@ -15,7 +15,12 @@
#pragma once #pragma once
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#include <time.h> #include <time.h>
#include <cmath> #include <cmath>
......
...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986,
0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048,
6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) {
0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103,
0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places,
for (int i = 0; i < outs->numel(); ++i) { for (int i = 0; i < outs->numel(); ++i) {
LOG(INFO) << o_data[i]; LOG(INFO) << o_data[i];
} }
for (int i = 0; i < lod.size(); ++i) { for (size_t i = 0; i < lod.size(); ++i) {
for (int j = 0; j < lod[i].size(); ++j) { for (size_t j = 0; j < lod[i].size(); ++j) {
LOG(INFO) << lod[i][j]; LOG(INFO) << lod[i][j];
} }
} }
......
...@@ -8,3 +8,5 @@ add_subdirectory(npu) ...@@ -8,3 +8,5 @@ add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu)
# The APU device wrapper is built only when APU support is enabled.
if(LITE_WITH_APU)
  lite_cc_library(device_apu SRCS device.cc)
endif()
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/apu/device.h"
#include <dlfcn.h>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace apu {
// Resolves the symbol `name` in the shared library `libHandle` with dlsym().
// Both arguments must be non-null (enforced with CHECK). Returns the symbol
// address, or nullptr after logging a warning when the lookup yields nothing.
inline void* LoadFunc(void* libHandle, const char* name) {
  CHECK(libHandle != nullptr);
  CHECK(name != nullptr);
  // Discard any stale error so the dlerror() below describes this lookup only.
  dlerror();
  void* fn = dlsym(libHandle, name);
  if (fn == nullptr) {
    // dlerror() returns NULL when no error is pending; streaming a null
    // char* is undefined behaviour, so substitute a placeholder.
    const char* reason = dlerror();
    LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
                 << "] Because "
                 << (reason != nullptr ? reason : "unknown error");
  }
  return fn;
}
// Compiles `model` through the Neuron runtime library opened at `libHandle`.
// The NeuronCompilation_* entry points are resolved at run time with LoadFunc.
// Returns the finished compilation, or nullptr when a required entry point is
// missing or when creating/finishing the compilation fails.
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
  typedef int (*NeuronCompilation_create)(NeuronModel * model,
                                          NeuronCompilation * *compilation);
  typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
  typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);

#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
  FUNC_NAME VARIABLE_NAME =                                 \
      reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS

  // LoadFunc returns nullptr for unresolved symbols; calling through a null
  // function pointer would crash, so bail out early instead.
  if (neuron_compilation_create == nullptr ||
      neuron_compilation_finish == nullptr) {
    LOG(WARNING) << "[APU] Neuron Runtime compilation functions are missing!";
    return nullptr;
  }

  NeuronCompilation* compilation = nullptr;

  VLOG(3) << "[APU] Compile model";
  int neuron_errCode = (*neuron_compilation_create)(model, &compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
    return nullptr;
  }
  neuron_errCode = (*neuron_compilation_finish)(compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
    // Release the half-built compilation; it is not returned to the caller,
    // so nobody else could free it.
    if (neuron_compilation_free != nullptr) {
      (*neuron_compilation_free)(compilation);
    }
    return nullptr;
  }
  VLOG(3) << "[APU] Build done";
  return compilation;
}
} // namespace apu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
namespace paddle {
namespace lite {
namespace apu {
// Entry point for building Neuron compilations on the APU backend.
class Device {
 public:
  Device() {}

  // Returns the function-local static Device instance.
  static Device& Global() {
    static Device instance;
    return instance;
  }

  // Builds a NeuronCompilation for `model`; `libHandle` is the handle used to
  // look up the Neuron runtime functions.
  NeuronCompilation* Build(void* libHandle, NeuronModel* model);
};
} // namespace apu
} // namespace lite
} // namespace paddle
...@@ -744,6 +744,15 @@ void act_reciprocal<float>(const float* din, ...@@ -744,6 +744,15 @@ void act_reciprocal<float>(const float* din,
} }
} }
// Element-wise absolute value: dout[i] = |din[i]| for i in [0, size).
// `threads` is not used by this implementation.
template <>
void act_abs<float>(const float* din, float* dout, int size, int threads) {
  for (int idx = 0; idx < size; ++idx) {
    dout[idx] = (din[idx] > 0 ? din[idx] : -din[idx]);
  }
}
#ifdef LITE_WITH_TRAIN #ifdef LITE_WITH_TRAIN
template <> template <>
void act_square_grad(const float* din, void act_square_grad(const float* din,
......
...@@ -83,6 +83,9 @@ void act_hard_swish(const T* din, ...@@ -83,6 +83,9 @@ void act_hard_swish(const T* din,
template <typename T> template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads); void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T>
void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN #ifdef LITE_WITH_TRAIN
template <typename T> template <typename T>
void act_square_grad( void act_square_grad(
......
...@@ -16,46 +16,3 @@ ...@@ -16,46 +16,3 @@
#include <algorithm> #include <algorithm>
#include <limits> #include <limits>
#include <memory> #include <memory>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
// Concatenates the float tensors in `input` along `axis` into `output`.
// The sizes before and after `axis` are taken from the first input; each
// input contributes dims[axis] slices per outer index.
void concat_func(const std::vector<lite::Tensor *> &input,
                 const int axis,
                 lite::Tensor *output) {
  auto first_dims = input[0]->dims();
  size_t tensor_count = input.size();

  // Number of elements in one slice behind the concat axis.
  int64_t inner_size = 1;
  for (int d = axis + 1; d < first_dims.size(); d++) {
    inner_size *= first_dims[d];
  }
  // Number of independent chunks in front of the concat axis.
  int64_t outer_count = 1;
  for (int d = 0; d < axis; d++) {
    outer_count *= first_dims[d];
  }

  float *out_base = output->mutable_data<float>();
  const int out_axis_len = output->dims()[axis];
  int64_t out_stride = out_axis_len * inner_size;

  int64_t axis_offset = 0;
  for (int t = 0; t < tensor_count; t++) {
    auto cur_dims = input[t]->dims();
    const float *src = input[t]->data<float>();
    int64_t cur_axis_len = cur_dims[axis];
    int64_t chunk = cur_axis_len * inner_size;
    // First destination element of this input inside outer chunk 0.
    float *dst = out_base + axis_offset * inner_size;
    for (int o = 0; o < outer_count; o++) {
      std::memcpy(dst, src, sizeof(float) * chunk);
      dst += out_stride;
      src += chunk;
    }
    axis_offset += cur_axis_len;
  }
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
...@@ -25,9 +25,39 @@ namespace lite { ...@@ -25,9 +25,39 @@ namespace lite {
namespace arm { namespace arm {
namespace math { namespace math {
void concat_func(const std::vector<lite::Tensor *> &input, template <typename T>
void concat_func(const std::vector<lite::Tensor*>& input,
const int axis, const int axis,
lite::Tensor *output); lite::Tensor* output) {
size_t num = input.size();
auto dim_0 = input[0]->dims();
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
auto* dst_ptr = output->mutable_data<T>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
auto* src_ptr = input[n]->data<T>();
int64_t in_concat_axis = dims[axis];
auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math } // namespace math
} // namespace arm } // namespace arm
......
...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src, ...@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src,
reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in); reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
} }
// Gradient of a mean reduction: every one of the `size` elements of `in_grad`
// receives out_grad[0] / size. Does nothing when `size` is not positive.
template <>
void mean_grad<float>(const float* out_grad, float* in_grad, int size) {
  if (size <= 0) {
    return;
  }
  float grad = out_grad[0] / size;
  float32x4_t grad_v = vdupq_n_f32(grad);
  int loop = size >> 2;
  int remain = size & 3;

  // Address each store through the loop index: advancing a shared pointer
  // inside the parallel region would be a data race between threads.
#pragma omp parallel for
  for (int i = 0; i < loop; ++i) {
    vst1q_f32(in_grad + (i << 2), grad_v);
  }
  // Scalar tail for the last (size % 4) elements.
  float* tail = in_grad + (loop << 2);
  for (int i = 0; i < remain; ++i) {
    tail[i] = grad;
  }
}
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src, ...@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src,
int height_in, int height_in,
int width_in); int width_in);
template <typename T>
void mean_grad(const T* out_grad, T* in_grad, int size);
} // namespace math } // namespace math
} // namespace arm } // namespace arm
} // namespace lite } // namespace lite
......
...@@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES) ...@@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps}) nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps}) nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps})
lite_cc_library(cuda_context SRCS context.cc DEPS device_info)
add_subdirectory(math) add_subdirectory(math)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/context.h"
namespace paddle {
namespace lite {} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/cuda/blas.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
template <TargetType Type>
class Context;
using CUDAContext = Context<TargetType::kCUDA>;
// Only works with CUDA kernels.
template <>
class Context<TargetType::kCUDA> {
public:
typename Env<TargetType::kCUDA>::Devs& devs =
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler
// Creates the fp32 cuBLAS wrapper when `devs` is non-empty; otherwise only
// logs that no device was found.
void InitOnce() {
  if (devs.size() == 0) {
    LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
    return;
  }
  cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
}
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
if (io_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "data stream index exceeds the maximum stream number, "
"set to default stream(0)!";
io_stream_id = 0;
}
if (exec_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "exec stream index exceeds the maximum stream number, "
"set to default stream(0)!";
exec_stream_id = 0;
}
exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id];
io_stream_ = devs[dev_id].io_streams()[io_stream_id];
exec_stream_id_ = exec_stream_id;
io_stream_id_ = io_stream_id;
need_sync_ = false;
}
void CopySharedTo(CUDAContext* ctx) {
CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_;
}
const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
const cudaStream_t& io_stream() const { return io_stream_; }
void SetIoStream(cudaStream_t stream) { io_stream_ = stream; }
std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; }
void SetCuBlasFP32(std::shared_ptr<cuda::Blas<float>> cublas_fp32) {
cublas_fp32_ = cublas_fp32;
}
const std::vector<cudaEvent_t>& input_events() { return input_events_; }
void SetInputEvents(const std::vector<cudaEvent_t>& input_events) {
input_events_.clear();
input_events_.assign(input_events.begin(), input_events.end());
}
const std::vector<cudaEvent_t>& output_events() { return output_events_; }
void SetOutputEvents(const std::vector<cudaEvent_t>& output_events) {
output_events_.clear();
output_events_.assign(output_events.begin(), output_events.end());
}
std::vector<cudaStream_t> all_exec_streams() {
int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
return devs[dev_id].exec_streams();
}
void SetSyncStreams(const std::vector<int>& nums) {
sync_streams_.clear();
std::vector<cudaStream_t> exec_streams = all_exec_streams();
for (size_t i = 0; i < nums.size(); ++i) {
CHECK(nums[i] >= 0 && nums[i] < static_cast<int>(exec_streams.size()))
<< "streams id is not valid";
sync_streams_.push_back(exec_streams[nums[i]]);
}
InitSyncEvents(nums.size());
}
void InitSyncEvents(const int num) {
sync_events_.clear();
for (int i = 0; i < num; ++i) {
cudaEvent_t eve;
TargetWrapperCuda::CreateEventWithFlags(&eve);
sync_events_.push_back(eve);
}
}
void SetNeedSync(bool sync) { need_sync_ = sync; }
bool need_sync() const { return need_sync_; }
void Sync() {
CHECK_EQ(sync_streams_.size(), sync_events_.size());
for (size_t i = 0; i < sync_events_.size(); ++i) {
TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]);
TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]);
}
}
std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private:
int device_id_;
// overall information
int exec_stream_id_;
int io_stream_id_;
cudaStream_t exec_stream_;
cudaStream_t io_stream_;
// not thread-safe, should allocate for each thread.
std::shared_ptr<cuda::Blas<float>> cublas_fp32_;
// kernel information
std::vector<cudaEvent_t> input_events_;
std::vector<cudaEvent_t> output_events_;
// multi stream sync.
std::vector<cudaStream_t> sync_streams_;
std::vector<cudaEvent_t> sync_events_;
bool need_sync_;
};
} // namespace lite
} // namespace paddle
...@@ -58,7 +58,7 @@ void CLContext::AddKernel(const std::string &kernel_name, ...@@ -58,7 +58,7 @@ void CLContext::AddKernel(const std::string &kernel_name,
auto program = GetProgram(file_name, options); auto program = GetProgram(file_name, options);
VLOG(3) << " --- end get program --- "; VLOG(3) << " --- end get program --- ";
VLOG(3) << " --- to create kernel: " << kernel_name << " --- "; VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
std::unique_ptr<cl::Kernel> kernel( std::shared_ptr<cl::Kernel> kernel(
new cl::Kernel(program, kernel_name.c_str(), &status)); new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
VLOG(3) << " --- end create kernel --- "; VLOG(3) << " --- end create kernel --- ";
......
...@@ -29,13 +29,14 @@ class CLContext { ...@@ -29,13 +29,14 @@ class CLContext {
public: public:
~CLContext() { ~CLContext() {
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
clReleaseKernel(kernels_[kidx]->get()); // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
} }
kernels_.clear(); kernels_.clear();
kernel_offset_.clear(); kernel_offset_.clear();
for (auto &p : programs_) { for (auto &p : programs_) {
clReleaseProgram(p.second->get()); // Note(ysh329): Dont't need `clReleaseProgram`
p.second.reset();
} }
programs_.clear(); programs_.clear();
LOG(INFO) << "release cl::Program, cl::Kernel finished."; LOG(INFO) << "release cl::Program, cl::Kernel finished.";
...@@ -66,9 +67,10 @@ class CLContext { ...@@ -66,9 +67,10 @@ class CLContext {
int divitor = 2); int divitor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size, // cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size); // size_t max_work_size);
private: private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_; std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_; std::vector<std::shared_ptr<cl::Kernel>> kernels_;
std::map<std::string, int> kernel_offset_; std::map<std::string, int> kernel_offset_;
}; };
......
...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input,
if (ceil_h > in_dims_h - 1) { if (ceil_h > in_dims_h - 1) {
ceil_h = in_dims_h- 1; ceil_h = in_dims_h- 1;
} }
float wight0_w = center_w - floor_w; CL_DTYPE wight0_w = center_w - floor_w;
float wight0_h = center_h - floor_h; CL_DTYPE wight0_h = center_h - floor_h;
float wight1_w = 1.0 - wight0_w; CL_DTYPE wight1_w = 1.0 - wight0_w;
float wight1_h = 1.0 - wight0_h; CL_DTYPE wight1_h = 1.0 - wight0_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input,
CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
+ (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
} }
...@@ -29,12 +29,12 @@ CLRuntime::~CLRuntime() { ...@@ -29,12 +29,12 @@ CLRuntime::~CLRuntime() {
command_queue_->flush(); command_queue_->flush();
command_queue_->finish(); command_queue_->finish();
} }
// For controlling the destruction order: // For controlling the destruction order
command_queue_.reset(); command_queue_.reset();
context_.reset(); context_.reset();
device_.reset(); device_.reset();
platform_.reset(); platform_.reset();
LOG(INFO) << "release ~CLRuntime() "; device_info_.clear();
} }
bool CLRuntime::Init() { bool CLRuntime::Init() {
......
...@@ -55,7 +55,7 @@ class CLRuntime { ...@@ -55,7 +55,7 @@ class CLRuntime {
std::map<std::string, size_t>& GetDeviceInfo(); std::map<std::string, size_t>& GetDeviceInfo();
private: private:
CLRuntime() = default; CLRuntime() { Init(); }
~CLRuntime(); ~CLRuntime();
......
# Nothing to build in this directory unless RKNPU support is enabled.
if(NOT LITE_WITH_RKNPU)
  return()
endif()

# Device wrapper for the RKNPU backend, built from device.cc and depending on
# the libraries listed in rknpu_builder_libs and rknpu_runtime_libs.
lite_cc_library(device_rknpu
  SRCS device.cc
  DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
// Binds `input_nodes`/`output_nodes` to `rk_graph`, creates an executor for
// the graph, calls Build() on it and hands ownership to the caller.
// `model_name` is not used by this implementation.
std::unique_ptr<rk::nn::Exection> Device::Build(
    std::string& model_name,                                   // NOLINT
    rk::nn::Graph* rk_graph,                                   // NOLINT
    std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes,  // NOLINT
    std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes  // NOLINT
    ) {
  VLOG(3) << "[RKNPU] Build model";

  // The graph must know its inputs/outputs before the executor is created.
  rk_graph->SetInputsOutputs(input_nodes, output_nodes);

  std::unique_ptr<rk::nn::Exection> executor(new rk::nn::Exection(rk_graph));
  executor->Build();
  return executor;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
// Entry point for turning an RK graph into a runnable executor.
class Device {
 public:
  // Returns the shared instance, a function-local static created on first
  // use.
  static Device& Global() {
    static Device instance;
    return instance;
  }

  Device() {}

  // Builds an executor for `rk_graph` using the given input and output
  // tensors and returns it; the caller owns the result.
  std::unique_ptr<rk::nn::Exection> Build(
      std::string& model_name,                                   // NOLINT
      rk::nn::Graph* rk_graph,                                   // NOLINT
      std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes,  // NOLINT
      std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes  // NOLINT
      );                                                         // NOLINT

 private:
};
} // namespace rknpu
} // namespace lite
} // namespace paddle
...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL) ...@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL) endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml) lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak) lite_cc_library(x86_cpu_info SRCS cpu_info.cc)
add_subdirectory(jit) add_subdirectory(jit)
add_subdirectory(math) add_subdirectory(math)
...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() { ...@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() {
void* GetMKLMLDsoHandle() { void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__) #if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib"); return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib");
#elif defined(_WIN32) #elif defined(_WIN32)
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll"); return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else #else
......
...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { ...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() {
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0; size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) { for (size_t i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len; wgt_offset_tmp += groups[i] * block_len;
} }
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
......
...@@ -28,6 +28,12 @@ ...@@ -28,6 +28,12 @@
#define posix_memalign_free free #define posix_memalign_free free
#endif #endif
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) { void* GenBase::operator new(size_t size) {
void* ptr; void* ptr;
constexpr size_t alignment = 32ul; constexpr size_t alignment = 32ul;
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0, 0,
"GenBase Alloc %ld error!", "GenBase Alloc %ld error!",
size); size);
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr; return ptr;
} }
......
...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> { ...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// size_t num_seqs = scores->NumElements(lod_level); // size_t num_seqs = scores->NumElements(lod_level);
size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t num_seqs = scores->lod()[lod_level].size() - 1;
size_t seq_width = 1; size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) { for (size_t i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i]; seq_width *= scores->dims()[i];
} }
......
...@@ -23,7 +23,7 @@ namespace math { ...@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols, int num_flatten_cols,
bool trans) { bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1); PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
MatDescriptor retv; MatDescriptor retv;
if (num_flatten_cols > 1) { if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { ...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
auto idx_dims = index->dims(); auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, out_dims); PADDLE_ENFORCE_EQ(idx_dims, out_dims);
...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) { lite::Tensor* index) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { ...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor {
auto idx_dims = index.dims(); auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) { for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, og_dims); PADDLE_ENFORCE_EQ(idx_dims, og_dims);
......
...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { ...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() {
// Do not support nested omp parallem. // Do not support nested omp parallem.
num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); num_threads = omp_in_parallel() ? 1 : omp_get_max_threads();
#endif #endif
return std::max(num_threads, 1L); return std::max<int>(num_threads, 1L);
} }
using ThreadHandler = using ThreadHandler =
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <time.h>
#include <cstdio> #include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <time.h>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -37,7 +37,9 @@ ...@@ -37,7 +37,9 @@
#define GOOGLE_GLOG_DLL_DECL #define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
#include <stdio.h> #include <stdio.h>
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> #include <windows.h>
#include <winsock.h>
#include <numeric> // std::accumulate in msvc #include <numeric> // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h #ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { ...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule); return reinterpret_cast<void *>(hModule);
} }
extern struct timeval;
static int gettimeofday(struct timeval *tp, void *tzp) { static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock; time_t clock;
struct tm tm; struct tm tm;
......
...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
endif() endif()
if (LITE_WITH_X86)
lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc) lite_cc_library(types SRCS types.cc)
else()
lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc)
endif()
lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel)
lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(scope SRCS scope.cc DEPS tensor)
lite_cc_library(device_info SRCS device_info.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
...@@ -38,7 +33,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor) ...@@ -38,7 +33,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
if (LITE_WITH_ARM) if (LITE_WITH_ARM)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context) lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context)
else() else()
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context) lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context)
endif() endif()
#-------------------------------------------- GET CODE META INFO ------------------------------------------ #-------------------------------------------- GET CODE META INFO ------------------------------------------
......
...@@ -6,5 +6,5 @@ endif() ...@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest) lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM)) if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels}) lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif() endif()
...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { ...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() {
CHECK(!shared_tensor_array->empty()) CHECK(!shared_tensor_array->empty())
<< "shared_tensor_array is empty yet"; << "shared_tensor_array is empty yet";
target_tensor_array->resize(shared_tensor_array->size()); target_tensor_array->resize(shared_tensor_array->size());
for (int i = 0; i < shared_tensor_array->size(); i++) { for (size_t i = 0; i < shared_tensor_array->size(); i++) {
target_tensor_array->at(i).Resize( target_tensor_array->at(i).Resize(
shared_tensor_array->at(i).dims()); shared_tensor_array->at(i).dims());
TargetCopy(param_type->type->target(), TargetCopy(param_type->type->target(),
...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, ...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name,
auto b_tensor_array = auto b_tensor_array =
base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>(); base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>();
CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size());
for (int i = 0; i < a_tensor_array->size(); i++) { for (size_t i = 0; i < a_tensor_array->size(); i++) {
Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* a_tensor = &(a_tensor_array->at(i));
Tensor* b_tensor = &(b_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i));
if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) {
......
...@@ -166,7 +166,7 @@ class TestCase { ...@@ -166,7 +166,7 @@ class TestCase {
// TODO(Superjomn) Move this method to utils or DDim? // TODO(Superjomn) Move this method to utils or DDim?
bool ShapeEquals(const DDim& a, const DDim& b) { bool ShapeEquals(const DDim& a, const DDim& b) {
if (a.size() != b.size()) return false; if (a.size() != b.size()) return false;
for (int i = 0; i < a.size(); i++) { for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return false; if (a[i] != b[i]) return false;
} }
return true; return true;
......
...@@ -16,8 +16,7 @@ ...@@ -16,8 +16,7 @@
#include "lite/utils/any.h" #include "lite/utils/any.h"
#ifdef LITE_WITH_CUDA #ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/blas.h" #include "lite/backends/cuda/context.h"
#include "lite/backends/cuda/cuda_utils.h"
#endif #endif
#ifdef LITE_WITH_OPENCL #ifdef LITE_WITH_OPENCL
#include <unordered_map> #include <unordered_map>
...@@ -53,14 +52,15 @@ class Context; ...@@ -53,14 +52,15 @@ class Context;
using HostContext = Context<TargetType::kHost>; using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using CUDAContext = Context<TargetType::kCUDA>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>; using NPUContext = Context<TargetType::kNPU>;
using APUContext = Context<TargetType::kAPU>;
using XPUContext = Context<TargetType::kXPU>; using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>; using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>; using FPGAContext = Context<TargetType::kFPGA>;
using BMContext = Context<TargetType::kBM>; using BMContext = Context<TargetType::kBM>;
using MLUContext = Context<TargetType::kMLU>; using MLUContext = Context<TargetType::kMLU>;
using RKNPUContext = Context<TargetType::kRKNPU>;
template <> template <>
class Context<TargetType::kHost> { class Context<TargetType::kHost> {
...@@ -88,6 +88,21 @@ class Context<TargetType::kNPU> { ...@@ -88,6 +88,21 @@ class Context<TargetType::kNPU> {
}; };
#endif #endif
#ifdef LITE_WITH_APU
// Context for the APU target. It carries no state of its own.
template <>
class Context<TargetType::kAPU> {
 public:
  Context() {}
  // Declared here only; no definition is visible in this header.
  explicit Context(const APUContext& ctx);

  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() {}
  // Nothing is shared between APU contexts, so this is a no-op.
  void CopySharedTo(APUContext* ctx) {}

  // There is no state to copy, but the function must still return *this:
  // flowing off the end of a function with a non-void return type is
  // undefined behaviour.
  APUContext& operator=(const APUContext& ctx) { return *this; }
  std::string name() const { return "APUContext"; }
};
#endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
template <> template <>
class Context<TargetType::kBM> { class Context<TargetType::kBM> {
...@@ -105,6 +120,21 @@ class Context<TargetType::kBM> { ...@@ -105,6 +120,21 @@ class Context<TargetType::kBM> {
}; };
#endif #endif
#ifdef LITE_WITH_RKNPU
// Context for the RKNPU target. It carries no state of its own.
template <>
class Context<TargetType::kRKNPU> {
 public:
  Context() {}
  // Declared here only; no definition is visible in this header.
  explicit Context(const RKNPUContext& ctx);

  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() {}
  // Nothing is shared between RKNPU contexts, so this is a no-op.
  void CopySharedTo(RKNPUContext* ctx) {}

  // There is no state to copy, but the function must still return *this:
  // flowing off the end of a function with a non-void return type is
  // undefined behaviour.
  RKNPUContext& operator=(const RKNPUContext& ctx) { return *this; }
  std::string name() const { return "RKNPUContext"; }
};
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
template <> template <>
class Context<TargetType::kXPU> { class Context<TargetType::kXPU> {
...@@ -286,103 +316,6 @@ class Context<TargetType::kMLU> { ...@@ -286,103 +316,6 @@ class Context<TargetType::kMLU> {
}; };
#endif // LITE_WITH_MLU #endif // LITE_WITH_MLU
#ifdef LITE_WITH_CUDA
// Only works with CUDA kernels.
template <>
class Context<TargetType::kCUDA> {
public:
typename Env<TargetType::kCUDA>::Devs& devs =
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {
if (devs.size() > 0) {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
} else {
LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
}
}
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
if (io_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "data stream index exceeds the maximum stream number, "
"set to default stream(0)!";
io_stream_id = 0;
}
if (exec_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "exec stream index exceeds the maximum stream number, "
"set to default stream(0)!";
exec_stream_id = 0;
}
exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id];
io_stream_ = devs[dev_id].io_streams()[io_stream_id];
exec_stream_id_ = exec_stream_id;
io_stream_id_ = io_stream_id;
}
void CopySharedTo(CUDAContext* ctx) {
CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_;
}
const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
const cudaStream_t& io_stream() const { return io_stream_; }
void SetIoStream(cudaStream_t stream) { io_stream_ = stream; }
std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; }
void SetCuBlasFP32(std::shared_ptr<cuda::Blas<float>> cublas_fp32) {
cublas_fp32_ = cublas_fp32;
}
const std::vector<cudaEvent_t>& input_events() { return input_events_; }
void SetInputEvents(const std::vector<cudaEvent_t>& input_events) {
input_events_.clear();
input_events_.assign(input_events.begin(), input_events.end());
}
const std::vector<cudaEvent_t>& output_events() { return output_events_; }
void SetOutputEvents(const std::vector<cudaEvent_t>& output_events) {
output_events_.clear();
output_events_.assign(output_events.begin(), output_events.end());
}
std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private:
int device_id_;
// overall information
int exec_stream_id_;
int io_stream_id_;
cudaStream_t exec_stream_;
cudaStream_t io_stream_;
// not thread-safe, should allocate for each thread.
std::shared_ptr<cuda::Blas<float>> cublas_fp32_;
// kernel information
std::vector<cudaEvent_t> input_events_;
std::vector<cudaEvent_t> output_events_;
};
#endif
#ifdef LITE_WITH_X86 #ifdef LITE_WITH_X86
template <> template <>
class Context<TargetType::kX86> { class Context<TargetType::kX86> {
...@@ -455,7 +388,9 @@ class ContextScheduler { ...@@ -455,7 +388,9 @@ class ContextScheduler {
return *x; return *x;
} }
std::unique_ptr<KernelContext> NewContext(TargetType target) { std::unique_ptr<KernelContext> NewContext(
TargetType target,
/*only used for cuda context*/ int exec_stream_id = 0) {
std::unique_ptr<KernelContext> ctx(new KernelContext); std::unique_ptr<KernelContext> ctx(new KernelContext);
switch (target) { switch (target) {
case TARGET(kHost): case TARGET(kHost):
...@@ -472,7 +407,7 @@ class ContextScheduler { ...@@ -472,7 +407,7 @@ class ContextScheduler {
case TARGET(kCUDA): { case TARGET(kCUDA): {
int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice(); int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
auto& context = ctx->As<CUDAContext>(); auto& context = ctx->As<CUDAContext>();
context.Init(dev_id); context.Init(dev_id, exec_stream_id);
kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo( kernel_contexts_[TargetType::kCUDA].As<CUDAContext>().CopySharedTo(
&context); &context);
} break; } break;
...@@ -489,6 +424,18 @@ class ContextScheduler { ...@@ -489,6 +424,18 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
&ctx->As<APUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
&ctx->As<RKNPUContext>());
break;
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
case TARGET(kXPU): case TARGET(kXPU):
kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo( kernel_contexts_[TargetType::kXPU].As<XPUContext>().CopySharedTo(
...@@ -558,6 +505,12 @@ class ContextScheduler { ...@@ -558,6 +505,12 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
#ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif
#ifdef LITE_WITH_XPU #ifdef LITE_WITH_XPU
InitContext<TargetType::kXPU, XPUContext>(); InitContext<TargetType::kXPU, XPUContext>();
#endif #endif
......
...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { ...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
active_ids_ = core_ids_; active_ids_ = core_ids_;
} else { } else {
active_ids_.resize(thread_num); active_ids_.resize(thread_num);
for (int i = 0; i < thread_num; ++i) { for (uint32_t i = 0; i < thread_num; ++i) {
if (i < big_core_ids_.size()) { if (i < big_core_ids_.size()) {
active_ids_[i] = big_core_ids_[i]; active_ids_[i] = big_core_ids_[i];
} else { } else {
......
...@@ -159,7 +159,7 @@ class Env { ...@@ -159,7 +159,7 @@ class Env {
static Devs* devs = new Devs(); static Devs* devs = new Devs();
return *devs; return *devs;
} }
static void Init(int max_stream = 4) { static void Init(int max_stream = 6) {
#ifdef LITE_WITH_MLU #ifdef LITE_WITH_MLU
CNRT_CALL(cnrtInit(0)); CNRT_CALL(cnrtInit(0));
#endif #endif
...@@ -175,6 +175,7 @@ class Env { ...@@ -175,6 +175,7 @@ class Env {
} else { } else {
LOG(INFO) << "Found " << count << " device(s)"; LOG(INFO) << "Found " << count << " device(s)";
} }
CHECK_GT(max_stream, 0) << "max_stream must be greater than 0.";
// create all device // create all device
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
auto dev = Device<Type>(i, max_stream); auto dev = Device<Type>(i, max_stream);
...@@ -234,8 +235,8 @@ class Device<TARGET(kCUDA)> { ...@@ -234,8 +235,8 @@ class Device<TARGET(kCUDA)> {
std::string name() { return device_prop_.name; } std::string name() { return device_prop_.name; }
int core_num() { return device_prop_.multiProcessorCount; } int core_num() { return device_prop_.multiProcessorCount; }
float max_memory() { return device_prop_.totalGlobalMem / 1048576.; } float max_memory() { return device_prop_.totalGlobalMem / 1048576.; }
std::vector<cudaStream_t> exec_streams() { return exec_stream_; } const std::vector<cudaStream_t>& exec_streams() { return exec_stream_; }
std::vector<cudaStream_t> io_streams() { return io_stream_; } const std::vector<cudaStream_t>& io_streams() { return io_stream_; }
int sm_version() { return sm_version_; } int sm_version() { return sm_version_; }
bool has_fp16() { return has_fp16_; } bool has_fp16() { return has_fp16_; }
......
...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, ...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *alias, std::string *alias,
Place *place) { Place *place) {
auto parts = Split(kernel_type, "/"); auto parts = Split(kernel_type, "/");
CHECK_EQ(parts.size(), 5); CHECK_EQ(parts.size(), 5u);
*op_type = parts[0]; *op_type = parts[0];
*alias = parts[1]; *alias = parts[1];
......
...@@ -37,6 +37,7 @@ lite_cc_library(mir_passes ...@@ -37,6 +37,7 @@ lite_cc_library(mir_passes
demo_pass.cc demo_pass.cc
runtime_context_assign_pass.cc runtime_context_assign_pass.cc
memory_optimize_pass.cc memory_optimize_pass.cc
multi_stream_analysis_pass.cc
mlu_postprocess_pass.cc mlu_postprocess_pass.cc
weight_quantization_preprocess_pass.cc weight_quantization_preprocess_pass.cc
quantized_op_attributes_inference_pass.cc quantized_op_attributes_inference_pass.cc
......
...@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -116,8 +116,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
} }
size_t weight_num = conv_weight_t->data_size(); size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false; bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
bool is_weight_quantization = bool is_weight_quantization = conv_op_desc->HasAttr("quantize_weight_bits");
conv_op_desc->HasAttr("quantize_weight_bits") ? true : false;
// compute BN alpha and beta // compute BN alpha and beta
Tensor alpha_tensor, beta_tensor; Tensor alpha_tensor, beta_tensor;
...@@ -164,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -164,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + i * w; auto ptr_row = conv_weight_d + i * w;
for (unsigned int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
...@@ -204,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -204,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i]; ptr_row[j] *= alpha_data[i];
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels for (int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels for (int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i]; conv_weight_d[i * w + j] *= alpha_data[i];
} }
} }
......
...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto channel_scale_tensor = auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>(); scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>(); auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) { for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range); weight_scale.push_back(channel_scale_data[i] / range);
} }
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "lite/core/mir/generate_program_pass.h" #include "lite/core/mir/generate_program_pass.h"
#include <memory> #include <memory>
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "lite/core/mir/graph_visualize_pass.h" #include "lite/core/mir/graph_visualize_pass.h"
...@@ -25,10 +26,37 @@ namespace mir { ...@@ -25,10 +26,37 @@ namespace mir {
void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void GenerateProgramPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
VLOG(4) << "final program \n" << Visualize(graph.get()); VLOG(4) << "final program \n" << Visualize(graph.get());
for (auto& item : graph->StmtTopologicalOrder()) { std::vector<Node*> nodes_in_order;
#ifdef LITE_WITH_CUDA
const std::string depend_pass = "multi_stream_analysis_pass";
const std::string attr_name = "nodes_in_order";
mir::Pass* pass = mir::PassManager::Global().LookUp(depend_pass);
if (pass->HasAttr(attr_name)) {
nodes_in_order = pass->GetAttr<std::vector<Node*>>(attr_name);
}
#endif
if (nodes_in_order.empty()) {
nodes_in_order = graph->StmtTopologicalOrder();
}
for (auto& item : nodes_in_order) {
if (item->IsStmt()) { if (item->IsStmt()) {
auto& stmt = item->AsStmt(); auto& stmt = item->AsStmt();
VLOG(4) << stmt; VLOG(4) << stmt;
#ifdef LITE_WITH_CUDA
if (stmt.kernels().front()->target() == TargetType::kCUDA) {
stmt.kernels()
.front()
->mutable_context()
->As<CUDAContext>()
.SetNeedSync(stmt.need_sync_);
stmt.kernels()
.front()
->mutable_context()
->As<CUDAContext>()
.SetSyncStreams(stmt.sync_streams_);
}
#endif
insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front())); insts_.emplace_back(stmt.op(), std::move(stmt.kernels().front()));
} }
} }
......
...@@ -85,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -85,7 +85,23 @@ std::string Visualize(mir::SSAGraph* graph) {
if (!node->IsStmt()) continue; if (!node->IsStmt()) continue;
auto op_info = node->AsStmt().op_info(); auto op_info = node->AsStmt().op_info();
auto op_type = op_info->Type(); auto op_type = op_info->Type();
std::string op_name = string_format("%s%d", op_type.c_str(), op_idx++); std::string op_name;
if (node->AsStmt().need_sync_) {
std::ostringstream oss;
for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) {
oss << std::to_string(node->AsStmt().sync_streams_[i]);
if (i != node->AsStmt().sync_streams_.size() - 1) {
oss << ",";
}
}
op_name = string_format("%s%d, stream=%d, sync_streams={%s}",
op_type.c_str(),
op_idx++,
node->AsStmt().stream_id_,
oss.str().c_str());
} else {
op_name = string_format("%s%d", op_type.c_str(), op_idx++);
}
// Add its input&output variables as the Dot nodes // Add its input&output variables as the Dot nodes
dot.AddNode(op_name, dot.AddNode(op_name,
{Dot::Attr("shape", "box"), {Dot::Attr("shape", "box"),
...@@ -93,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -93,7 +109,13 @@ std::string Visualize(mir::SSAGraph* graph) {
Dot::Attr("color", "black"), Dot::Attr("color", "black"),
Dot::Attr("fillcolor", "yellow")}); Dot::Attr("fillcolor", "yellow")});
for (auto& x : node->inlinks) { for (auto& x : node->inlinks) {
auto var_name = x->AsArg().name; std::string var_name;
if (x->AsArg().lane != -1) {
var_name = string_format(
"%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane);
} else {
var_name = x->AsArg().name;
}
if (!exists_var_names.count(var_name)) { if (!exists_var_names.count(var_name)) {
dot.AddNode(var_name, {}); dot.AddNode(var_name, {});
exists_var_names.insert(var_name); exists_var_names.insert(var_name);
...@@ -101,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) { ...@@ -101,7 +123,13 @@ std::string Visualize(mir::SSAGraph* graph) {
dot.AddEdge(var_name, op_name, {}); dot.AddEdge(var_name, op_name, {});
} }
for (auto& x : node->outlinks) { for (auto& x : node->outlinks) {
auto var_name = x->AsArg().name; std::string var_name;
if (x->AsArg().lane != -1) {
var_name = string_format(
"%s, lane=%d", x->AsArg().name.c_str(), x->AsArg().lane);
} else {
var_name = x->AsArg().name;
}
if (!exists_var_names.count(var_name)) { if (!exists_var_names.count(var_name)) {
dot.AddNode(var_name, {}); dot.AddNode(var_name, {});
exists_var_names.insert(var_name); exists_var_names.insert(var_name);
......
...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)}) .BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM)}); .ExcludeTargets({TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph op's type info // get subgraph op's type info
size_t kernel_size = inst_node->AsStmt().kernels().size(); size_t kernel_size = inst_node->AsStmt().kernels().size();
CHECK_GT(kernel_size, 0); CHECK_GT(kernel_size, 0u);
VLOG(4) << "subgraph kernel size: " << kernel_size; VLOG(4) << "subgraph kernel size: " << kernel_size;
for (size_t i = 0; i < kernel_size; ++i) { for (size_t i = 0; i < kernel_size; ++i) {
...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { ...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc = auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get()) static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock(); ->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx); auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc); CHECK(op_desc);
if (op_desc->Type() == "conv2d") { if (op_desc->Type() == "conv2d") {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/mir/multi_stream_analysis_pass.h"
#include <algorithm>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/core/device_info.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace mir {
// Resets the scheduling containers and the io_copy_once counter so that a new
// run of the pass does not inherit state from a previous one.
void MultiStreamAnalysisPass::CleanUp() {
  exec_ops_.clear();
  wait_que_.clear();
  wait_que_cpu_.clear();
  // std::queue offers no clear(); drain it element by element.
  while (!exec_que_.empty()) {
    exec_que_.pop();
  }
  ops_in_streams_.clear();
  resources_.clear();
  map_arg_to_lane_.clear();
  op_types_set_.clear();
  io_copy_once_num_ = 0;
}
// Builds the initial scheduling state for `graph`:
//   1. marks every op output as not yet accessible and every weight/persist
//      input as accessible;
//   2. queues feed / io_copy_once ops for execution and sorts all other ops
//      into the CUDA or the non-CUDA waiting list;
//   3. assigns the feed ops to streams in round-robin order and puts every
//      io_copy_once op on stream 0.
void MultiStreamAnalysisPass::Init(SSAGraph* graph) {
  // If not cleaned, the clone will overlay the previous state.
  CleanUp();

  for (auto& stmt_node : graph->StmtTopologicalOrder()) {
    if (!stmt_node->IsStmt()) {
      continue;
    }
    auto& stmt = stmt_node->AsStmt();

    // Outputs start out inaccessible; an entry that already exists is kept.
    for (Node* out_node : stmt_node->outlinks) {
      CHECK(out_node->IsArg());
      auto& out_arg = out_node->AsArg();
      resources_.insert({out_arg.name, false});
    }

    // Weight and persistent inputs are accessible from the very beginning.
    for (Node* in_node : stmt_node->inlinks) {
      CHECK(in_node->IsArg());
      auto& in_arg = in_node->AsArg();
      if (in_arg.is_weight || in_arg.is_persist) {
        resources_[in_arg.name] = true;
      }
    }

    // feed and io_copy_once ops have no dependencies and can be launched
    // directly; every other op waits in the queue that matches its target.
    const std::string type = stmt.op_type();
    const bool launch_directly = (type == "feed" || type == "io_copy_once");
    if (launch_directly) {
      exec_que_.push(stmt_node);
    } else if (stmt.kernels().front()->target() == TargetType::kCUDA) {
      wait_que_.push_back(stmt_node);
    } else {
      wait_que_cpu_.push_back(stmt_node);
    }
    op_types_set_.insert(type);
  }

  // Distribute the feed ops over the streams and make their outputs
  // accessible.
  int lane = 0;
  auto graph_inputs = graph->inputs();
  ops_in_streams_.resize(max_stream_);
  for (auto& in_var : graph_inputs) {
    const bool is_feed_var =
        in_var->AsArg().name.find("feed") != std::string::npos;
    if (is_feed_var) {
      for (auto& consumer : in_var->outlinks) {
        auto& feed_stmt = consumer->AsStmt();
        if (feed_stmt.op_type() != "feed") {
          continue;
        }
        // feed op doesn't need to wait sync.
        feed_stmt.need_sync_ = false;
        CHECK_EQ(static_cast<int>(consumer->outlinks.size()), 1)
            << "feed op must have one output.";
        for (auto& out_var : consumer->outlinks) {
          auto& out_arg = out_var->AsArg();
          out_arg.lane = lane;
          map_arg_to_lane_[out_arg.name] = lane;
          resources_[out_arg.name] = true;
        }
        feed_stmt.stream_id_ = lane;
        ops_in_streams_[lane].push_back(consumer);
        // Round-robin: wrap around once the last stream has been used.
        ++lane;
        if (lane >= max_stream_) {
          lane = 0;
        }
      }
    }

    // All io_copy_once ops go to the first stream.
    for (auto& consumer : in_var->outlinks) {
      auto& copy_stmt = consumer->AsStmt();
      if (copy_stmt.op_type() != "io_copy_once") {
        continue;
      }
      ops_in_streams_[0].push_back(consumer);
      copy_stmt.stream_id_ = 0;
      copy_stmt.need_sync_ = false;
      ++io_copy_once_num_;
    }
  }
}
// Returns false (after logging the offending type) as soon as one of the op
// types collected in op_types_set_ is on the unsupported list; returns true
// when none of them is.
bool MultiStreamAnalysisPass::CheckOpSupport() {
  const std::unordered_set<std::string> unsupported_ops = {
      "while", "conditional_block", "conditional_block_infer", "graph_op"};
  for (const auto& type : op_types_set_) {
    if (unsupported_ops.count(type) == 0) {
      continue;
    }
    LOG(INFO) << "multi_stream_analysis_pass don't support " << type
              << ", just return.";
    return false;
  }
  return true;
}
// An op is prepared when it is a feed op, or when every one of its input
// variables is currently marked accessible.
bool MultiStreamAnalysisPass::IsPrepared(Node* stmt_node) {
  // feed ops are prepared when init.
  if (stmt_node->AsStmt().op_type() == "feed") {
    return true;
  }

  // Gather the input variable names and let CheckAccess decide.
  std::vector<std::string> input_names;
  for (auto* in_node : stmt_node->inlinks) {
    input_names.push_back(in_node->AsArg().name);
  }
  return CheckAccess(input_names);
}
// Returns true when every name in `args` is marked accessible in resources_.
// An empty list is trivially accessible.
bool MultiStreamAnalysisPass::CheckAccess(
    const std::vector<std::string>& args) {
  for (const auto& arg_name : args) {
    // operator[] default-inserts `false` for a name that was never
    // registered, so unknown variables count as inaccessible.
    if (!resources_[arg_name]) {
      return false;
    }
  }
  return true;
}
// Picks, among the candidate `lanes`, the stream that currently holds the
// fewest ops; ties keep the earliest candidate. For stream 0 the io_copy_once
// ops are not counted. Returns 0 when `lanes` is empty.
int MultiStreamAnalysisPass::SelectStreamId(const std::vector<int>& lanes) {
  if (lanes.empty()) {
    return 0;
  }

  // Load of stream 0 with the io_copy_once ops left out.
  const int stream0_load = ops_in_streams_[0].size() - io_copy_once_num_;
  auto load_of = [&](int lane) -> int {
    if (lane == 0) {
      return stream0_load;
    }
    return static_cast<int>(ops_in_streams_[lane].size());
  };

  int best_lane = lanes.front();
  int best_load = load_of(best_lane);
  for (size_t idx = 1; idx < lanes.size(); ++idx) {
    const int candidate_load = load_of(lanes[idx]);
    if (candidate_load < best_load) {
      best_lane = lanes[idx];
      best_load = candidate_load;
    }
  }
  return best_lane;
}
// Schedules `stmt_node`: records it in the launch order, chooses its stream
// from the lanes of its non-weight inputs, records which other streams it has
// to synchronize with, and marks its outputs as accessible on the chosen
// stream.
void MultiStreamAnalysisPass::Launch(Node* stmt_node) {
  auto& stmt = stmt_node->AsStmt();

  // Start from a clean sync state. sync_streams_ is only ever appended to and
  // need_sync_ is only ever raised below, so without this reset a node that
  // is launched again would keep (and duplicate) the results of an earlier
  // launch.
  stmt.need_sync_ = false;
  stmt.sync_streams_.clear();

  // record ops launch order.
  exec_que_.push(stmt_node);

  // Collect the distinct lanes of the inputs, in first-seen order.
  std::vector<int> lanes;
  for (auto& in_node : stmt_node->inlinks) {
    const auto& in_arg = in_node->AsArg();
    // Weight parameter does not involve stream id, so just skip it.
    if (in_arg.is_weight || in_arg.is_persist) {
      continue;
    }
    if (std::find(lanes.begin(), lanes.end(), in_arg.lane) == lanes.end()) {
      lanes.push_back(in_arg.lane);
    }
  }

  const int stream_id = SelectStreamId(lanes);

  // If the inputs of the op live on multiple streams, every stream other than
  // the selected one has to be synchronized before the op runs.
  if (lanes.size() > 1) {
    for (int lane : lanes) {
      if (lane != stream_id) {
        stmt.sync_streams_.push_back(lane);
      }
    }
    stmt.need_sync_ = true;
  }

  // io_copy are nodes inserted across devices and need to be synced.
  if (stmt.op_type() == "io_copy") {
    stmt.need_sync_ = true;
  }
  stmt.stream_id_ = stream_id;

  // set output lane and set the output of op to be accessible.
  for (auto& out_node : stmt_node->outlinks) {
    auto& out_arg = out_node->AsArg();
    out_arg.lane = stream_id;
    resources_[out_arg.name] = true;
  }
  ops_in_streams_[stream_id].push_back(stmt_node);
}
// Computes a launch order and a stream assignment for every op of `graph` and
// publishes the order through the "nodes_in_order" pass attribute.
//
// When the graph contains an op type the pass does not support, no order is
// computed and an empty "nodes_in_order" is published instead, so that an
// order published by an earlier run (which points into an earlier graph) can
// never be picked up by a consumer of the attribute.
void MultiStreamAnalysisPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
#ifdef LITE_WITH_CUDA
  typename Env<TargetType::kCUDA>::Devs& devs =
      Env<TargetType::kCUDA>::Global();
  int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
  max_stream_ = devs[dev_id].max_stream();
#else
  LOG(FATAL) << "Please re-compile by setting the cmake flag LITE_WITH_CUDA=ON";
#endif

  // Attribute used for passing the launch order between passes.
  const std::string attr_name{"nodes_in_order"};

  // Find the correct startup sequence for op.
  Init(graph.get());
  if (!CheckOpSupport()) {
    // exec_ops_ was emptied by Init(); publishing it overwrites any stale
    // order left behind by a previous run.
    exec_ops_.clear();
    SetAttr<std::vector<Node*>>(attr_name, &exec_ops_);
    return;
  }

  while (!(this->wait_que_.empty() && this->wait_que_cpu_.empty())) {
    const size_t prev_size =
        this->wait_que_.size() + this->wait_que_cpu_.size();

    // launch the accessible cuda kernel and remove it from wait que.
    for (auto it = this->wait_que_.begin(); it != this->wait_que_.end();) {
      if (IsPrepared(*it)) {
        Launch(*it);
        it = wait_que_.erase(it);
      } else {
        ++it;
      }
    }

    // launch the accessible cpu kernel and remove it from wait que.
    for (auto cpu_it = this->wait_que_cpu_.begin();
         cpu_it != this->wait_que_cpu_.end();) {
      if (IsPrepared(*cpu_it)) {
        Launch(*cpu_it);
        cpu_it = wait_que_cpu_.erase(cpu_it);
      } else {
        ++cpu_it;
      }
    }

    // A full sweep that launches nothing means the remaining ops can never
    // become ready.
    if (this->wait_que_.size() + this->wait_que_cpu_.size() == prev_size) {
      LOG(FATAL) << "network topo error!";
    }
  }

  // Get exec ops order.
  while (!exec_que_.empty()) {
    auto* node = exec_que_.front();
    exec_ops_.push_back(node);
    VLOG(4) << node->AsStmt().op_type()
            << " stream: " << node->AsStmt().stream_id_
            << ", sync: " << node->AsStmt().need_sync_;
    if (node->AsStmt().need_sync_) {
      for (size_t i = 0; i < node->AsStmt().sync_streams_.size(); ++i) {
        VLOG(4) << "        " << node->AsStmt().sync_streams_[i];
      }
    }
    exec_que_.pop();
  }

  // Set attribute parameters, for passing parameters between passes
  SetAttr<std::vector<Node*>>(attr_name, &exec_ops_);

  LOG(INFO) << "stream " << 0 << " has "
            << ops_in_streams_[0].size() - io_copy_once_num_
            << " ops. (exclude io_copy_once).";
  for (size_t i = 1; i < ops_in_streams_.size(); ++i) {
    LOG(INFO) << "stream " << i << " has " << ops_in_streams_[i].size()
              << " ops.";
  }
}
} // namespace mir
} // namespace lite
} // namespace paddle
// Register MultiStreamAnalysisPass under the name
// "multi_stream_analysis_pass" and bind it to the CUDA target.
REGISTER_MIR_PASS(multi_stream_analysis_pass,
paddle::lite::mir::MultiStreamAnalysisPass)
.BindTargets({TARGET(kCUDA)});
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <list>
#include <memory>
#include <queue>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/mir/pass.h"
namespace paddle {
namespace lite {
namespace mir {
/*
 * MultiStreamAnalysisPass will find the correct launch sequence for all ops.
 * Ideally, the order should be multiple asynchronous ops and a small number of
 * synchronous ops.
 */
class MultiStreamAnalysisPass : public StmtPass {
 public:
  void Apply(const std::unique_ptr<SSAGraph>& graph) override;

 private:
  // Init resource list. Set all ops except feed to inaccessible state and set
  // stream id according to the number of inputs.
  void Init(SSAGraph* graph);

  // Clean state information of all member variables.
  void CleanUp();

  // After launching, unlock the output resources of op.
  void Launch(Node* stmt_node);

  // If all inputs of an op are accessible, the op is considered to be in the
  // prepared state
  bool IsPrepared(Node* stmt_node);

  // Determine if all inputs of op are accessible.
  bool CheckAccess(const std::vector<std::string>& args);

  // The logic of selecting a stream:
  // 1. Make the number of ops on each stream as close as possible.
  // 2. The selected stream must be one of the streams contained in the input
  // arg
  int SelectStreamId(const std::vector<int>& lanes);

  // Check if the model's ops are all supported. If you encounter unsupported
  // ops, exit
  bool CheckOpSupport();

 private:
  // CUDA ops that are still waiting for their inputs.
  std::list<Node*> wait_que_;
  // Non-CUDA ops that are still waiting for their inputs.
  std::list<Node*> wait_que_cpu_;
  // Ops in the order in which they were launched.
  std::queue<Node*> exec_que_;
  // Launch order drained from exec_que_; published as a pass attribute.
  std::vector<Node*> exec_ops_;
  // Ops assigned to each stream, indexed by stream id.
  std::vector<std::vector<Node*>> ops_in_streams_;
  // Variable name -> whether the variable is currently accessible.
  std::unordered_map<std::string, bool> resources_;
  // Feed output variable name -> lane it was assigned to.
  std::unordered_map<std::string, int> map_arg_to_lane_;
  // Number of streams to schedule onto. Zero-initialized so the member never
  // holds an indeterminate value before Apply() sets it.
  int max_stream_{0};
  // Number of io_copy_once ops placed on stream 0.
  int io_copy_once_num_{0};
  // Every op type that occurs in the graph.
  std::unordered_set<std::string> op_types_set_;
};
} // namespace mir
} // namespace lite
} // namespace paddle
...@@ -80,6 +80,12 @@ class Node { ...@@ -80,6 +80,12 @@ class Node {
// Description. // Description.
std::string desc; std::string desc;
// for cuda multi stream
bool need_sync_{false};
int stream_id_{0};
// streams which need to be sync. exclude stream_id_
std::vector<int> sync_streams_{};
}; };
struct Arg { struct Arg {
...@@ -93,6 +99,7 @@ class Node { ...@@ -93,6 +99,7 @@ class Node {
// if the need more than one tool operator(eg. io_copy layout calib), the // if the need more than one tool operator(eg. io_copy layout calib), the
// argument between them should be persist to make sure it's only run once // argument between them should be persist to make sure it's only run once
bool is_persist{false}; bool is_persist{false};
int lane{-1};
}; };
Arg& AsArg(const std::string& name, int id); Arg& AsArg(const std::string& name, int id);
......
...@@ -17,9 +17,11 @@ ...@@ -17,9 +17,11 @@
#include <set> #include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <vector>
#include "lite/core/mir/node.h" #include "lite/core/mir/node.h"
#include "lite/core/mir/ssa_graph.h" #include "lite/core/mir/ssa_graph.h"
#include "lite/utils/varient.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
...@@ -121,6 +123,27 @@ class Pass { ...@@ -121,6 +123,27 @@ class Pass {
virtual ~Pass() = default; virtual ~Pass() = default;
bool HasAttr(const std::string& attr_name) const {
return pass_attrs_.count(attr_name) > 0;
}
// Set a pointer to the attribute. Specific pass itself takes ownership of the
// attribute.
template <typename AttrType>
void SetAttr(const std::string& attr_name, const AttrType* attr) {
VLOG(4) << "Setting the attribute " << attr_name << " for the pass "
<< name_;
pass_attrs_[attr_name].set<const AttrType>(*attr);
}
// Get a reference to the attribute previously set.
template <typename AttrType>
const AttrType& GetAttr(const std::string& attr_name) const {
CHECK(pass_attrs_.count(attr_name))
<< attr_name << " attr not register for pass " << name_;
return pass_attrs_.at(attr_name).get<const AttrType>();
}
private: private:
const Kind kind_; const Kind kind_;
std::string name_; std::string name_;
...@@ -128,6 +151,8 @@ class Pass { ...@@ -128,6 +151,8 @@ class Pass {
std::set<TargetType> bound_targets_; std::set<TargetType> bound_targets_;
std::set<TargetType> excluded_targets_; std::set<TargetType> excluded_targets_;
std::unordered_map<std::string, std::set<lite_api::Place>> bound_kernels_; std::unordered_map<std::string, std::set<lite_api::Place>> bound_kernels_;
std::unordered_map<std::string, variant<Node, std::vector<Node*>>>
pass_attrs_;
}; };
// Different kinds. // Different kinds.
......
...@@ -59,6 +59,9 @@ class PassRegistry { ...@@ -59,6 +59,9 @@ class PassRegistry {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// some platform-independent definitions
#include "lite/utils/macros.h"
#define REGISTER_MIR_PASS(name__, class__) \ #define REGISTER_MIR_PASS(name__, class__) \
paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \
new class__); \ new class__); \
...@@ -66,4 +69,4 @@ class PassRegistry { ...@@ -66,4 +69,4 @@ class PassRegistry {
return mir_pass_registry##name__.Touch(); \ return mir_pass_registry##name__.Touch(); \
} \ } \
static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \
__attribute__((unused)) = mir_pass_registry##name__ UNUSED = mir_pass_registry##name__
...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass) paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kAPU), TARGET(kRKNPU)});
...@@ -45,9 +45,10 @@ class RuntimeContextAssignPass : public StmtPass { ...@@ -45,9 +45,10 @@ class RuntimeContextAssignPass : public StmtPass {
inst.picked_kernel().target())); inst.picked_kernel().target()));
} }
#else #else
inst.picked_kernel().SetContext( int stream_id = inst.stream_id_;
ContextScheduler::Global().NewContext(inst.picked_kernel().target()));
inst.picked_kernel().SetContext(ContextScheduler::Global().NewContext(
inst.picked_kernel().target(), stream_id));
#endif #endif
} }
} }
......
...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { ...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() {
"turquoise4", "snow3", "sienna4", "salmon2", "turquoise4", "snow3", "sienna4", "salmon2",
}; };
std::unordered_map<Node *, int> subgraph_indices; std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) { for (size_t i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) { for (size_t j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i; subgraph_indices[subgraphs_[i][j]] = i;
} }
} }
...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, ...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
std::vector<std::vector<Node *>> subgraphs = std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)(); SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)(); SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size();
subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
} }
......
...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc( ...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc(
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<int64_t>& wshape) { const std::vector<int64_t>& wshape) {
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1u);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2u);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + paddle::lite::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { ...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) {
}; };
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
ASSERT_EQ(graph->nodes().size(), 9); ASSERT_EQ(graph->nodes().size(), 9u);
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) {
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
} }
} // namespace lite } // namespace lite
......
...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
// Collects the op types that have an APU subgraph bridge and hands a matching
// predicate to SubgraphFuser so the accepted statement nodes can be fused.
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Names of the op types this pass accepts; filled by the macro expansion below.
std::unordered_set<std::string> supported_lists;
// X-macro: while the bridge header is included, every
// USE_SUBGRAPH_BRIDGE(op_type, target) invocation inserts the stringified
// op_type into supported_lists and logs it at INFO level. The last statement
// of the macro has no trailing ';' -- presumably each invocation in the header
// supplies it (TODO confirm against paddle_use_bridges.h).
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
supported_lists.insert(#op_type); \
LOG(INFO) << #op_type
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
// Predicate: true only for statement nodes whose op type was registered above.
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
// Run the fuser over the graph with a minimum subgraph size of 1.
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
...@@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -69,6 +85,20 @@ void BMSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
// Collects the op types that have an RKNPU subgraph bridge and hands a
// matching predicate to SubgraphFuser so the accepted statement nodes can be
// fused.
void RKNPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
// Names of the op types this pass accepts; filled by the macro expansion below.
std::unordered_set<std::string> supported_lists;
// X-macro: while the bridge header is included, every
// USE_SUBGRAPH_BRIDGE(op_type, target) invocation inserts the stringified
// op_type into supported_lists. The `target` argument is unused here.
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
#include "lite/kernels/rknpu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
// Predicate: true only for statement nodes whose op type was registered above.
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
// Run the fuser over the graph with a minimum subgraph size of 1.
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type); #define USE_SUBGRAPH_BRIDGE(op_type, target) supported_lists.insert(#op_type);
...@@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -89,9 +119,13 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
.BindTargets({TARGET(kBM)}); .BindTargets({TARGET(kBM)});
REGISTER_MIR_PASS(rknpu_subgraph_pass, paddle::lite::mir::RKNPUSubgraphPass)
.BindTargets({TARGET(kRKNPU)});
REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass) REGISTER_MIR_PASS(mlu_subgraph_pass, paddle::lite::mir::MLUSubgraphPass)
.BindTargets({TARGET(kMLU)}); .BindTargets({TARGET(kMLU)});
...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { ...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
// Program pass for the APU target; its only declared member overrides
// ProgramPass::Apply, which receives the SSA graph to process.
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class XPUSubgraphPass : public ProgramPass { class XPUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
...@@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass { ...@@ -37,6 +42,11 @@ class BMSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
// Program pass for the RKNPU target; its only declared member overrides
// ProgramPass::Apply, which receives the SSA graph to process.
class RKNPUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class MLUSubgraphPass : public ProgramPass { class MLUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { ...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
std::vector<std::string> shape_strings = Split(text, ":"); std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size()); shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) { for (size_t i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ","); std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) { for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str())); shapes[i].push_back(atoi(shape_num.c_str()));
...@@ -66,7 +66,7 @@ void FillInputTensors( ...@@ -66,7 +66,7 @@ void FillInputTensors(
for (int j = 0; j < input_tensor_size; j++) { \ for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[j] = static_cast<type>(value); \ input_tensor_data[j] = static_cast<type>(value); \
} }
for (int i = 0; i < input_tensor_shape.size(); i++) { for (size_t i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i); auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]); input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_size = ShapeProduction(input_tensor->shape()); auto input_tensor_size = ShapeProduction(input_tensor->shape());
...@@ -95,7 +95,7 @@ void CheckOutputTensors( ...@@ -95,7 +95,7 @@ void CheckOutputTensors(
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \
EXPECT_LT(rel_diff, 0.1); \ EXPECT_LT(rel_diff, 0.1); \
} }
for (int i = 0; i < output_tensor_type.size(); i++) { for (size_t i = 0; i < output_tensor_type.size(); i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i); auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
......
...@@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name, ...@@ -80,7 +80,7 @@ static bool InferScaleFromSubgraph(std::string var_name,
auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name); auto input_or_output_scales = op_info->GetAttr<std::vector<float>>(attr_name);
auto size = input_or_output_names.size(); auto size = input_or_output_names.size();
CHECK(size == input_or_output_scales.size()); CHECK(size == input_or_output_scales.size());
for (int i = 0; i < size; i++) { for (size_t i = 0; i < size; i++) {
if (input_or_output_names[i] == var_name) { if (input_or_output_names[i] == var_name) {
*scale = input_or_output_scales[i]; *scale = input_or_output_scales[i];
return true; return true;
...@@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -137,18 +137,23 @@ void PrecisionCastPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
nodes.push_back(node); nodes.push_back(node);
} }
// record the copied node.
std::unordered_map<std::string, Node*> cast_nodes;
for (auto& node : nodes) { for (auto& node : nodes) {
if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue; if (!node->IsStmt() || node->AsStmt().op_type() == "while") continue;
auto inlinks = node->inlinks; auto inlinks = node->inlinks;
for (auto* in : inlinks) { for (auto* in : inlinks) {
ComplementInputs(graph.get(), node, in); ComplementInputs(graph.get(), node, in, &cast_nodes);
} }
} }
} }
void PrecisionCastPass::ComplementInputs(SSAGraph* graph, void PrecisionCastPass::ComplementInputs(
Node* inst_node, SSAGraph* graph,
Node* in) { Node* inst_node,
Node* in,
std::unordered_map<std::string, Node*>* cast_nodes) {
// If this input is out of date. // If this input is out of date.
if (inst_node->inlinks.end() == if (inst_node->inlinks.end() ==
std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in)) std::find(inst_node->inlinks.begin(), inst_node->inlinks.end(), in))
...@@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph, ...@@ -184,16 +189,19 @@ void PrecisionCastPass::ComplementInputs(SSAGraph* graph,
in, in,
graph, graph,
inst_node, inst_node,
cast_nodes,
graph->valid_places()); graph->valid_places());
} }
} }
void PrecisionCastPass::AddCastInst(const Type& from, void PrecisionCastPass::AddCastInst(
const Type& to, const Type& from,
Node* in, const Type& to,
SSAGraph* graph, Node* in,
Node* inst_node, SSAGraph* graph,
const std::vector<Place>& valid_places) { Node* inst_node,
std::unordered_map<std::string, Node*>* cast_nodes,
const std::vector<Place>& valid_places) {
CHECK(!valid_places.empty()) << "valid_place should be set"; CHECK(!valid_places.empty()) << "valid_place should be set";
// var -> new_transform_op -> new_var -> inst // var -> new_transform_op -> new_var -> inst
...@@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from, ...@@ -203,66 +211,80 @@ void PrecisionCastPass::AddCastInst(const Type& from,
auto cast_op_output_name = in->AsArg().name + "/precision_trans"; auto cast_op_output_name = in->AsArg().name + "/precision_trans";
// in->AsArg().name + "/precision_trans/" + // in->AsArg().name + "/precision_trans/" +
// paddle::lite::to_string(node_id()); // paddle::lite::to_string(node_id());
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name); if (cast_nodes->count(in->AsArg().name)) {
cast_op_output_arg->AsArg().type = // Remove the old link
LiteType::GetTensorTy(from.target(), to.precision(), from.layout()); RemoveDirectedLink(in, inst_node);
auto* cast_inst = graph->NewInstructNode(); // Update the original instruction OpDesc.
// Update its input to the cast_op_output_name
// Add new link, newarg->inst
DirectedLink(cast_nodes->at(in->AsArg().name),
inst_node); // [io_copy kernel]'s output -> [current kernel]
// reset opdesc and update kernel information
UpdateInputs(
inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name);
} else {
auto* cast_op_output_arg = graph->NewArgumentNode(cast_op_output_name);
cast_op_output_arg->AsArg().type =
LiteType::GetTensorTy(from.target(), to.precision(), from.layout());
auto* cast_inst = graph->NewInstructNode();
// create Op and kernels. // create Op and kernels.
bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist; bool in_persist = in->AsArg().is_weight || in->AsArg().is_persist;
std::string cast_type = in_persist ? "calib_once" : "calib"; std::string cast_type = in_persist ? "calib_once" : "calib";
cast_op_output_arg->AsArg().is_persist = in_persist; cast_op_output_arg->AsArg().is_persist = in_persist;
auto cast_op = LiteOpRegistry::Global().Create(cast_type); auto cast_op = LiteOpRegistry::Global().Create(cast_type);
CHECK(cast_op) << "create op [" << cast_op << "] failed"; CHECK(cast_op) << "create op [" << cast_op << "] failed";
// Create the new var manually. // Create the new var manually.
inst_node->AsStmt().op()->scope()->Var(cast_op_output_name); inst_node->AsStmt().op()->scope()->Var(cast_op_output_name);
// Create Calib Instruction. // Create Calib Instruction.
cpp::OpDesc op_desc; cpp::OpDesc op_desc;
op_desc.SetType(cast_type); op_desc.SetType(cast_type);
op_desc.SetInput("Input", {in->AsArg().name}); op_desc.SetInput("Input", {in->AsArg().name});
op_desc.SetOutput("Out", {cast_op_output_name}); op_desc.SetOutput("Out", {cast_op_output_name});
float scale; float scale;
if (InferScale(in, inst_node, &scale)) { if (InferScale(in, inst_node, &scale)) {
op_desc.SetAttr("scale", scale); op_desc.SetAttr("scale", scale);
} }
cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope()); cast_op->Attach(op_desc, inst_node->AsStmt().op()->scope());
auto kernels = cast_op->CreateKernels(valid_places); auto kernels = cast_op->CreateKernels(valid_places);
std::vector<std::unique_ptr<KernelBase>> selected_kernels; std::vector<std::unique_ptr<KernelBase>> selected_kernels;
bool is_found = false; bool is_found = false;
for (auto& kernel : kernels) { for (auto& kernel : kernels) {
const Type* in_arg_ty = kernel->GetInputDeclType("Input"); const Type* in_arg_ty = kernel->GetInputDeclType("Input");
const Type* out_arg_ty = kernel->GetOutputDeclType("Out"); const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
if (TypeCompatible(*in_arg_ty, from) && if (TypeCompatible(*in_arg_ty, from) &&
out_arg_ty->precision() == to.precision()) { out_arg_ty->precision() == to.precision()) {
is_found = true; is_found = true;
selected_kernels.emplace_back(std::move(kernel)); selected_kernels.emplace_back(std::move(kernel));
// we pick the kernel // we pick the kernel
cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op); cast_inst->AsStmt(cast_type, std::move(selected_kernels), cast_op);
break; (*cast_nodes)[in->AsArg().name] = cast_op_output_arg;
break;
}
} }
}
CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":" CHECK(is_found) << "Can't find a Cast kernel for Cast op: " << from << ":"
<< in->AsArg().name << "->" << to << ":" << in->AsArg().name << "->" << to << ":"
<< inst_node->AsStmt().op_info()->Type(); << inst_node->AsStmt().op_info()->Type();
// Remove the old link // Remove the old link
RemoveDirectedLink(in, inst_node); RemoveDirectedLink(in, inst_node);
// Update the original instruction OpDesc. // Update the original instruction OpDesc.
// Update its input to the io_copy_output_name // Update its input to the io_copy_output_name
// Add new link, var -> new_inst, new_inst->newarg, newarg->inst // Add new link, var -> new_inst, new_inst->newarg, newarg->inst
DirectedLink(in, cast_inst); DirectedLink(in, cast_inst);
DirectedLink(cast_inst, cast_op_output_arg); DirectedLink(cast_inst, cast_op_output_arg);
DirectedLink(cast_op_output_arg, inst_node); DirectedLink(cast_op_output_arg, inst_node);
// reset opdesc and update kernel information // reset opdesc and update kernel information
UpdateInputs( UpdateInputs(
inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name); inst_node->AsStmt().op().get(), in->AsArg().name, cast_op_output_name);
}
// recreate the op // recreate the op
auto original_selected_kernel = auto original_selected_kernel =
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册