Commit c5b969dc authored by J jackzhang235

Merge remote-tracking branch 'upstream/develop' into develop

......@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
if (MSVC_STATIC_CRT)
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
add_compile_options(/MP)
message(STATUS "Using parallel compiling (/MP)")
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif()
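# Illustrative configure-time sketch (not part of the build scripts): passing
# -DMSVC_STATIC_CRT=OFF skips the /MT and /MTd flags above, so MSVC falls back to
# its default dynamic CRT (/MD, /MDd), e.g.
#   cmake .. -G "Visual Studio 14 2015 Win64" -DMSVC_STATIC_CRT=OFF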
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
find_package(CUDA QUIET)
endif()
......@@ -59,8 +84,10 @@ lite_option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
lite_option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
lite_option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
lite_option(LITE_WITH_NPU "Enable NPU in lite mode" OFF)
lite_option(LITE_WITH_RKNPU "Enable RKNPU in lite mode" OFF)
lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
......@@ -68,7 +95,7 @@ lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
lite_option(LITE_WITH_FPGA "Enable FPGA support in lite" OFF)
lite_option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
lite_option(LITE_WITH_PROFILE "Enable profile mode in lite framework" OFF)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF IF LITE_WITH_PROFILE)
lite_option(LITE_WITH_PRECISION_PROFILE "Enable precision profile in profile mode ON in lite" OFF)
lite_option(LITE_SHUTDOWN_LOG "Shutdown log system or not." OFF)
lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
......@@ -104,9 +131,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
# CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE)
if(WIN32)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
else()
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
endif()
endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
......@@ -128,6 +162,10 @@ if (LITE_WITH_PYTHON)
include(external/pybind11) # download, build, install pybind11
endif()
if(LITE_WITH_RKNPU)
include(device/rknpu)
endif()
# for mobile
if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......@@ -184,6 +222,7 @@ endif()
include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
......@@ -208,7 +247,9 @@ include(generic) # simplify cmake module
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(version) # set PADDLE_VERSION
include(flags)
if(NOT APPLE)
include(flags)
endif()
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
rem global variables
set BUILD_EXTRA=OFF
set BUILD_JAVA=ON
set BUILD_PYTHON=OFF
set BUILD_DIR=%source_path%
set OPTMODEL_DIR=""
set BUILD_TAILOR=OFF
set BUILD_CV=OFF
set SHUTDOWN_LOG=ON
set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
set workspace=%source_path%
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of the Visual Studio Command Prompt script (vcvarsall.bat), such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "%vcvarsall_dir%" (
echo "------------%vcvarsall_dir% not exist------------"
goto set_vcvarsall_dir
)
call:prepare_thirdparty
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
md "%build_directory%"
)
set root_dir=%workspace%
set build_directory=%BUILD_DIR%\build.lite.x86
set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code
set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug
rem for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
rem here we fake an empty file to make cmake work.
if NOT EXIST "%GEN_CODE_PATH_PREFIX%" (
md "%GEN_CODE_PATH_PREFIX%"
)
type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc"
if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" (
md "%DEBUG_TOOL_PATH_PREFIX%"
)
copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\"
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON ^
-DWITH_MKLDNN=OFF ^
-DLITE_WITH_X86=ON ^
-DLITE_WITH_PROFILE=OFF ^
-DWITH_LITE=ON ^
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^
-DLITE_WITH_ARM=OFF ^
-DWITH_GPU=OFF ^
-DLITE_BUILD_EXTRA=ON ^
-DLITE_WITH_PYTHON=ON ^
-DPYTHON_EXECUTABLE="%python_path%"
call "%vcvarsall_dir%" amd64
msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1
goto:eof
:prepare_thirdparty
SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>"
set tmp_var=!python_path!
call:remove_space
set python_path=!tmp_var!
if "!python_path!"=="" (
set python_path=python.exe
) else (
if NOT exist "!python_path!" (
echo "------------!python_path! not exist------------"
goto:eof
)
)
if EXIST "%workspace%\third-party" (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists."
) else (
echo "The directory of third_party exists, the third-party-05b862.tar.gz exists."
call:rm_rebuild_dir "%workspace%\third-party"
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
) else (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
call:download_third_party
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
) else (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists."
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
)
git submodule update --init --recursive
goto:eof
:download_third_party
powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^
'%workspace%third-party-05b862.tar.gz')
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
goto remove_left_space
)
goto:eof
\ No newline at end of file
......@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG})
endif()
if(WIN32)
# windows header option for all targets.
add_definitions(-D_XKEYCHECK_H)
if (NOT MSVC)
message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
endif(NOT MSVC)
endif(WIN32)
if(LITE_WITH_CUDA)
add_definitions(-DLITE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU)
......@@ -70,7 +79,7 @@ endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
if(WIN32)
if(WIN32 OR APPLE)
# OpenMP is not well supported on Windows or macOS for now
set(OPENMP_FLAGS "")
else(WIN32)
......@@ -134,8 +143,15 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU")
endif()
if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU")
endif()
if (LITE_WITH_XPU)
add_definitions("-DLITE_WITH_XPU")
if (LITE_WITH_XTCL)
add_definitions("-DLITE_WITH_XTCL")
endif()
endif()
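# Illustrative configure-time sketch (the SDK path is a placeholder): these definitions
# are driven by the options in the top-level CMakeLists.txt, where LITE_WITH_XTCL only
# takes effect when LITE_WITH_XPU is ON, e.g.
#   cmake .. -DLITE_WITH_XPU=ON -DLITE_WITH_XTCL=ON -DXPU_SDK_ROOT=/path/to/xpu_sdk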
if (LITE_WITH_OPENCL)
......@@ -156,9 +172,10 @@ endif()
if (LITE_WITH_PROFILE)
add_definitions("-DLITE_WITH_PROFILE")
if (LITE_WITH_PRECISION_PROFILE)
add_definitions("-DLITE_WITH_PRECISION_PROFILE")
endif()
endif()
if (LITE_WITH_PRECISION_PROFILE)
add_definitions("-DLITE_WITH_PRECISION_PROFILE")
endif()
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
......@@ -177,3 +194,6 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
add_definitions("-DLITE_ON_MODEL_OPTIMIZE_TOOL")
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
if (LITE_WITH_PYTHON)
add_definitions("-DLITE_WITH_PYTHON")
endif(LITE_WITH_PYTHON)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
if(NOT LITE_WITH_RKNPU)
return()
endif()
if(NOT DEFINED RKNPU_DDK_ROOT)
set(RKNPU_DDK_ROOT $ENV{RKNPU_DDK_ROOT})
if(NOT RKNPU_DDK_ROOT)
message(FATAL_ERROR "Must set RKNPU_DDK_ROOT or env RKNPU_DDK_ROOT when LITE_WITH_RKNPU=ON")
endif()
endif()
message(STATUS "RKNPU_DDK_ROOT: ${RKNPU_DDK_ROOT}")
find_path(RKNPU_DDK_INC NAMES rknpu/rknpu_pub.h
PATHS ${RKNPU_DDK_ROOT}/include/ NO_DEFAULT_PATH)
if(NOT RKNPU_DDK_INC)
message(FATAL_ERROR "Can not find rknpu_pub.h in ${RKNPU_DDK_ROOT}/include")
endif()
include_directories("${RKNPU_DDK_ROOT}/include")
set(RKNPU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
set(RKNPU_SUB_LIB_PATH "lib64")
endif()
if(ARM_TARGET_ARCH_ABI STREQUAL "armv7")
set(RKNPU_SUB_LIB_PATH "lib")
endif()
find_library(RKNPU_DDK_FILE NAMES rknpu_ddk
PATHS ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH})
if(NOT RKNPU_DDK_FILE)
message(FATAL_ERROR "Can not find RKNPU_DDK_FILE in ${RKNPU_DDK_ROOT}/${RKNPU_SUB_LIB_PATH}")
else()
message(STATUS "Found RKNPU_DDK_FILE Library: ${RKNPU_DDK_FILE}")
add_library(rknpu_ddk SHARED IMPORTED GLOBAL)
set_property(TARGET rknpu_ddk PROPERTY IMPORTED_LOCATION ${RKNPU_DDK_FILE})
endif()
set(rknpu_runtime_libs rknpu_ddk CACHE INTERNAL "rknpu ddk runtime libs")
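# Illustrative configure-time sketch (the DDK path is a placeholder): RKNPU_DDK_ROOT can
# be passed on the command line or exported as an environment variable, as handled above, e.g.
#   cmake .. -DLITE_WITH_RKNPU=ON -DARM_TARGET_ARCH_ABI=armv8 -DRKNPU_DDK_ROOT=/path/to/rknpu_ddk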
......@@ -22,42 +22,10 @@ if(NOT DEFINED XPU_SDK_ROOT)
message(FATAL_ERROR "Must set XPU_SDK_ROOT or env XPU_SDK_ROOT when LITE_WITH_XPU=ON")
endif()
endif()
message(STATUS "XPU_SDK_ROOT: ${XPU_SDK_ROOT}")
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl
NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
include_directories("${XPU_SDK_ROOT}/XTDK/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_XPU_API_FILE NAMES xpuapi
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
......@@ -82,23 +50,55 @@ else()
set_property(TARGET xpu_sdk_xpu_rt PROPERTY IMPORTED_LOCATION ${XPU_SDK_XPU_RT_FILE})
endif()
find_library(XPU_SDK_XPU_JITC_FILE NAMES xpujitc
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
set(xpu_runtime_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xpu_api xpu_sdk_xpu_rt CACHE INTERNAL "xpu builder libs")
if(LITE_WITH_XTCL)
find_path(XPU_SDK_INC NAMES xtcl.h
PATHS ${XPU_SDK_ROOT}/XTCL/include/xtcl NO_DEFAULT_PATH)
if(NOT XPU_SDK_INC)
message(FATAL_ERROR "Can not find xtcl.h in ${XPU_SDK_ROOT}/include")
endif()
include_directories("${XPU_SDK_ROOT}/XTCL/include")
find_library(XPU_SDK_XTCL_FILE NAMES xtcl
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_XTCL_FILE)
message(FATAL_ERROR "Can not find XPU XTCL Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU XTCL Library: ${XPU_SDK_XTCL_FILE}")
add_library(xpu_sdk_xtcl SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_xtcl PROPERTY IMPORTED_LOCATION ${XPU_SDK_XTCL_FILE})
endif()
find_library(XPU_SDK_TVM_FILE NAMES tvm
PATHS ${XPU_SDK_ROOT}/XTCL/so
NO_DEFAULT_PATH)
if(NOT XPU_SDK_TVM_FILE)
message(FATAL_ERROR "Can not find XPU TVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU TVM Library: ${XPU_SDK_TVM_FILE}")
add_library(xpu_sdk_tvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_tvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_TVM_FILE})
endif()
find_library(XPU_SDK_LLVM_FILE NAMES LLVM-8
PATHS ${XPU_SDK_ROOT}/XTDK/shlib
NO_DEFAULT_PATH)
if(NOT XPU_SDK_LLVM_FILE)
message(FATAL_ERROR "Can not find LLVM Library in ${XPU_SDK_ROOT}")
else()
message(STATUS "Found XPU LLVM Library: ${XPU_SDK_LLVM_FILE}")
add_library(xpu_sdk_llvm SHARED IMPORTED GLOBAL)
set_property(TARGET xpu_sdk_llvm PROPERTY IMPORTED_LOCATION ${XPU_SDK_LLVM_FILE})
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=1")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DDMLC_USE_GLOG=0")
set(xpu_runtime_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu runtime libs")
set(xpu_builder_libs xpu_sdk_xtcl xpu_sdk_tvm xpu_sdk_xpu_api xpu_sdk_xpu_rt xpu_sdk_llvm CACHE INTERNAL "xpu builder libs")
......@@ -36,7 +36,16 @@ else()
# eigen on cuda9.1 is missing the header math_functions.hpp
# https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen
GIT_TAG
URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
######################################################################################################
# url address of eigen before v2.3.0
# URL http://paddle-inference-dist.bj.bcebos.com/PaddleLite_ThirdParty%2Feigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
######################################################################################################
# url address of eigen since v2.6.0
# github address: https://github.com/eigenteam/eigen-git-mirror
# we changed the source code to adapt it for Windows compilation
# git diffs : (1) unsupported/Eigen/CXX11/src/Tensor/TensorBlockV2.h
######################################################################################################
URL https://paddlelite-data.bj.bcebos.com/third_party_libs/eigen-git-mirror-master-9ab917e9db99f5907d086aa73d5f9103.zip
DOWNLOAD_DIR ${EIGEN_SOURCECODE_DIR}
DOWNLOAD_NO_PROGRESS 1
PREFIX ${EIGEN_SOURCE_DIR}
......
......@@ -16,12 +16,6 @@ IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
IF(APPLE)
MESSAGE(WARNING "Mac is not supported with MKLML in Paddle yet. Force WITH_MKLML=OFF.")
SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in MacOS" FORCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
......@@ -38,7 +32,17 @@ IF(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE)
#TODO(intel-huying):
# Now enable the Erf function in the mklml library temporarily; it will be updated to the official version later.
SET(MKLML_VER "mklml_mac_2019.0.5.20190502" CACHE STRING "" FORCE)
SET(MKLML_URL "https://paddlelite-data.bj.bcebos.com/third_party_libs/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/libmklml.dylib)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.dylib)
ELSE()
#TODO(intel-huying):
# Now enable the Erf function in the mklml library temporarily; it will be updated to the official version later.
......
......@@ -70,10 +70,10 @@ SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
SET(py_env "")
IF(PYTHONINTERP_FOUND)
find_python_module(pip REQUIRED)
find_python_module(numpy REQUIRED)
#find_python_module(numpy REQUIRED)
#find_python_module(wheel REQUIRED)
#find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
#FIND_PACKAGE(NumPy REQUIRED)
#IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
# MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
# "please use pip to upgrade protobuf. pip install -U protobuf")
......
......@@ -276,7 +276,7 @@ function(cc_library TARGET_NAME)
add_dependencies(${TARGET_NAME} mklml)
if(WIN32)
target_link_libraries(${TARGET_NAME} ${MKLML_IOMP_LIB})
else(WIN32)
elseif(NOT APPLE)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif(WIN32)
endif()
......
......@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS})
......@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var)
endif()
if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_XPU)
foreach(var ${lite_deps_XPU_DEPS})
set(deps ${deps} ${var})
......@@ -131,7 +137,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -142,6 +148,7 @@ function(lite_cc_library TARGET)
CUDA_DEPS ${args_CUDA_DEPS}
CL_DEPS ${args_CL_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
ARM_DEPS ${args_ARM_DEPS}
CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS}
......@@ -161,8 +168,10 @@ function(lite_cc_library TARGET)
else()
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
# collect targets need to compile for lite
if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET})
......@@ -177,7 +186,7 @@ function(lite_cc_binary TARGET)
set(options " -g ")
endif()
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -191,7 +200,8 @@ function(lite_cc_binary TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -199,7 +209,9 @@ function(lite_cc_binary TARGET)
MLU_DEPS ${args_MLU_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
if (NOT APPLE)
# strip binary target to reduce size
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
......@@ -226,7 +238,7 @@ function(lite_cc_test TARGET)
endif()
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS
COMPILE_LEVEL # (basic|extra)
......@@ -248,7 +260,8 @@ function(lite_cc_test TARGET)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
HVY_DEPS ${args_HVY_DEPS}
......@@ -263,7 +276,9 @@ function(lite_cc_test TARGET)
"${TARGET}"
COMMENT "Strip debug symbols done on final executable file.")
endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
file(APPEND ${offline_test_registry_file} "${TARGET}\n")
# collect targets need to compile for lite
......@@ -280,6 +295,7 @@ set(npu_kernels CACHE INTERNAL "npu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels")
set(rknpu_kernels CACHE INTERNAL "rknpu kernels")
set(opencl_kernels CACHE INTERNAL "opencl kernels")
set(host_kernels CACHE INTERNAL "host kernels")
......@@ -295,12 +311,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM)
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra, train)
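# Illustrative sketch of a call to this helper (the target name, source file and extra
# deps are hypothetical, not taken from this change):
#   add_kernel(conv_compute_rknpu RKNPU basic SRCS conv_compute.cc
#              DEPS ${lite_kernel_deps} ${rknpu_runtime_libs})
# When the matching LITE_WITH_<device> option is OFF, the sources are appended to
# fake_kernels_src_list instead of being built, as the branches below show.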
function(add_kernel TARGET device level)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
......@@ -317,9 +333,18 @@ function(add_kernel TARGET device level)
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
return()
endif()
if ("${device}" STREQUAL "Host")
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(host_kernels "${host_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "ARM")
......@@ -332,16 +357,11 @@ function(add_kernel TARGET device level)
set(arm_kernels "${arm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "X86")
if (NOT LITE_WITH_X86)
if (NOT LITE_WITH_X86 OR LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
elseif (LITE_ON_MODEL_OPTIMIZE_TOOL)
foreach(src ${args_SRCS})
file(APPEND ${kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(x86_kernels "${x86_kernels};${TARGET}" CACHE INTERNAL "")
endif()
......@@ -381,8 +401,20 @@ function(add_kernel TARGET device level)
endif()
set(bm_kernels "${bm_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "RKNPU")
if (NOT LITE_WITH_RKNPU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(rknpu_kernels "${rknpu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "MLU")
if (NOT LITE_WITH_MLU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(mlu_kernels "${mlu_kernels};${TARGET}" CACHE INTERNAL "")
......@@ -426,7 +458,8 @@ function(add_kernel TARGET device level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -451,11 +484,13 @@ function(add_operator TARGET level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
if ("${level}" STREQUAL "train" AND (NOT LITE_WITH_TRAIN))
return()
endif()
foreach(src ${args_SRCS})
if(LITE_BUILD_TAILOR)
......@@ -478,7 +513,8 @@ function(add_operator TARGET level)
FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS}
XPU_DEPS ${args_XPU_DEPS}
BM_DEPS ${args_BM_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS}
MLU_DEPS ${args_MLU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS}
......@@ -486,6 +522,29 @@ function(add_operator TARGET level)
)
endfunction()
# only for Windows
function(create_static_lib TARGET_NAME)
set(libs ${ARGN})
list(REMOVE_DUPLICATES libs)
set(dummy_index 1)
set(dummy_offset 1)
# each dummy target consists of at most dummy_limit libraries
set(dummy_limit 60)
list(LENGTH libs libs_len)
foreach(lib ${libs})
list(APPEND dummy_list ${lib})
list(LENGTH dummy_list listlen)
if ((${listlen} GREATER ${dummy_limit}) OR (${dummy_offset} EQUAL ${libs_len}))
merge_static_libs(${TARGET_NAME}_dummy_${dummy_index} ${dummy_list})
set(dummy_list)
list(APPEND ${TARGET_NAME}_dummy_list ${TARGET_NAME}_dummy_${dummy_index})
MATH(EXPR dummy_index "${dummy_index}+1")
endif()
MATH(EXPR dummy_offset "${dummy_offset}+1")
endforeach()
merge_static_libs(${TARGET_NAME} ${${TARGET_NAME}_dummy_list})
endfunction()
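# Usage sketch: bundle_static_library below calls create_static_lib(${bundled_tgt_name} ${static_libs})
# on WIN32; the deduplicated libraries are merged in chunks of at most dummy_limit (60),
# presumably to keep each intermediate merge command within Windows command-line length limits.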
# Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target)
......@@ -529,7 +588,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
set(bundled_tgt_full_name
${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
#message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}")
message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
if(WIN32)
set(dummy_tgt_name dummy_${bundled_tgt_name})
create_static_lib(${bundled_tgt_name} ${static_libs})
add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
add_dependencies(${fake_target} ${tgt_name})
add_library(${dummy_tgt_name} STATIC IMPORTED)
set_target_properties(${dummy_tgt_name}
PROPERTIES
IMPORTED_LOCATION ${bundled_tgt_full_name}
INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
add_dependencies(${dummy_tgt_name} ${fake_target})
return()
endif()
if(NOT IOS)
file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
......
......@@ -7,7 +7,9 @@ message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
message(STATUS "LITE_WITH_BM:\t${LITE_WITH_BM}")
......@@ -75,6 +77,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_BM)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.bm")
endif(LITE_WITH_BM)
if (LITE_WITH_RKNPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.rknpu")
endif(LITE_WITH_RKNPU)
else()
set(INFER_LITE_PUBLISH_ROOT "${CMAKE_BINARY_DIR}/inference_lite_lib")
endif()
......@@ -82,16 +87,59 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib
if (LITE_WITH_PYTHON)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
if(WIN32)
set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd")
set(LITE_CORE_DEPS ${LITE_CORE})
add_custom_command(OUTPUT ${LITE_CORE}
COMMAND cmake -E copy $<TARGET_FILE:lite_pybind> ${LITE_CORE}
DEPENDS lite_pybind)
add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS})
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd"
DEPENDS copy_lite_pybind
)
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
else()
if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.dylib" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
else()
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.so"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/python/pybind/liblite_pybind.so" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.so")
endif()
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND python setup.py bdist_wheel
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
......@@ -107,10 +155,27 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
endif(WIN32)
endif()
if (LITE_WITH_X86)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
if (LITE_WITH_CUDA OR LITE_WITH_X86)
if(APPLE)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.dylib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
elseif(NOT WIN32)
add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
......@@ -118,50 +183,76 @@ if (LITE_WITH_X86)
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_custom_target(publish_inference_third_party ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party")
add_dependencies(publish_inference_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party)
endif()
endif()
if (LITE_WITH_X86)
if(WIN32)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
else()
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference_x86_cxx_lib test_model_bin)
add_dependencies(publish_inference_x86_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_x86_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
endif()
if(LITE_WITH_CUDA)
add_custom_target(publish_inference_cuda_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_SOURCE_DIR}/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/*.so" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_cuda_cxx_lib bundle_full_api)
add_dependencies(publish_inference_cuda_cxx_lib bundle_light_api)
add_dependencies(publish_inference_cuda_cxx_lib paddle_full_api_shared)
add_dependencies(publish_inference_cuda_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cuda_cxx_lib)
add_custom_target(publish_inference_cuda_cxx_demos ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND cp -r "${CMAKE_BINARY_DIR}/third_party/install/*" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
COMMAND cp -r "${CMAKE_SOURCE_DIR}/lite/demo/cxx/cuda_demo/*" "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_cuda_cxx_lib publish_inference_cuda_cxx_demos)
add_dependencies(publish_inference_cuda_cxx_demos paddle_full_api_shared)
endif(LITE_WITH_CUDA)
add_dependencies(publish_inference publish_inference_cuda_cxx_demos)
endif(LITE_WITH_CUDA)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (NOT LITE_ON_TINY_PUBLISH)
# add cxx lib
......@@ -193,7 +284,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
add_dependencies(publish_inference publish_inference_cxx_lib)
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
add_custom_command(TARGET publish_inference_cxx_lib POST_BUILD
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a)
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.a
COMMAND ${CMAKE_STRIP} "--strip-debug" ${INFER_LITE_PUBLISH_ROOT}/cxx/lib/*.so)
endif()
endif()
else()
......
......@@ -8,39 +8,48 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_CXX_FLAGS_RELEASE "-Os -DNDEBUG")
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
set(light_lib_DEPS light_api paddle_api paddle_api_light)
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH_BM OR ARM_TARGET_OS STREQUAL "android" OR ARM_TARGET_OS STREQUAL "armlinux"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
lite_cc_library(paddle_full_api_shared SHARED SRCS paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc
DEPS paddle_api paddle_api_light paddle_api_full)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml)
endif()
if(WIN32)
target_link_libraries(paddle_full_api_shared shlwapi.lib)
endif()
endif()
if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
endif(LITE_WITH_CUDA)
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
SRCS light_api_shared.cc
DEPS ${light_lib_DEPS}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels})
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels})
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
lite_cc_library(paddle_light_api_shared SHARED SRCS paddle_api.cc light_api.cc light_api_impl.cc
DEPS ${light_lib_DEPS}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
)
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels})
if(NOT APPLE AND NOT WIN32)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
add_custom_target(custom_linker_map DEPENDS ${LINK_MAP_FILE})
set_target_properties(paddle_full_api_shared PROPERTIES LINK_FLAGS ${LINK_FLAGS})
add_dependencies(paddle_full_api_shared custom_linker_map)
endif()
else()
if ((ARM_TARGET_OS STREQUAL "android") OR (ARM_TARGET_OS STREQUAL "armlinux"))
add_library(paddle_light_api_shared SHARED "")
......@@ -55,6 +64,11 @@ else()
# Need to add HIAI runtime libs (libhiai.so) dependency
target_link_libraries(paddle_light_api_shared ${npu_builder_libs} ${npu_runtime_libs})
endif()
if (LITE_WITH_RKNPU)
# Need to add RKNPU runtime libs dependency
target_link_libraries(paddle_light_api_shared ${rknpu_builder_libs} ${rknpu_runtime_libs})
endif()
endif()
endif()
......@@ -65,6 +79,7 @@ if (WITH_TESTING)
CUDA_DEPS ${cuda_kernels}
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels})
endif()
......@@ -78,6 +93,12 @@ if(LITE_WITH_BM)
set(cxx_api_deps ${cxx_api_deps} ${bm_deps})
endif()
if(LITE_WITH_RKNPU)
set(light_api_deps ${light_api_deps} ${rknpu_deps})
set(cxx_api_deps ${cxx_api_deps} ${rknpu_deps})
endif()
message(STATUS "get ops ${ops}")
message(STATUS "get X86 kernels ${x86_kernels}")
message(STATUS "get CUDA kernels ${cuda_kernels}")
......@@ -86,6 +107,7 @@ message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}")
message(STATUS "get BM kernels ${bm_kernels}")
message(STATUS "get MLU kernels ${mlu_kernels}")
......@@ -103,6 +125,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
......@@ -124,6 +147,7 @@ lite_cc_library(light_api SRCS light_api.cc
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -143,6 +167,7 @@ if(WITH_TESTING)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}
......@@ -188,7 +213,11 @@ if(WITH_TESTING)
lite_cc_test(test_classify_lite_bm SRCS test_classify_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/resnet50)
ARGS --model_dir=${LITE_MODEL_DIR}/classify)
lite_cc_test(test_yolov3_lite_bm SRCS test_yolov3_lite_bm.cc
DEPS mir_passes lite_api_test_helper paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels} ${bm_kernels} ${bm_bridges}
ARGS --model_dir=${LITE_MODEL_DIR}/yolov3)
endif()
endif()
endif()
......@@ -240,6 +269,7 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
ARGS --cl_path=${CMAKE_SOURCE_DIR}/lite/backends/opencl
--model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
add_dependencies(test_inceptionv4 extern_lite_download_inception_v4_simple_tar_gz)
# brief: we comment out ocr_test_ut because we do not supply an OCR model for testing; it is kept as a reference for running inference on NLP models
# lite_cc_test(test_ocr_attention SRCS ocr_attention_test.cc
# DEPS ${lite_model_test_DEPS})
......@@ -266,7 +296,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels})
FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels})
# The final inference library for just MobileConfig.
bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
target_link_libraries(paddle_api_full ${cuda_deps})
......@@ -282,6 +313,7 @@ lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api program mir_passes paddle_api_light
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
......@@ -291,6 +323,7 @@ lite_cc_test(test_apis SRCS apis_test.cc
X86_DEPS ${x86_kernels}
XPU_DEPS ${xpu_kernels}
FPGA_DEPS ${fpga_kernels}
RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
......@@ -316,7 +349,7 @@ add_dependencies(opt_base supported_kernel_op_info_h framework_proto all_kernel_
if (LITE_ON_MODEL_OPTIMIZE_TOOL)
message(STATUS "Compiling opt")
lite_cc_binary(opt SRCS opt.cc cxx_api_impl.cc paddle_api.cc cxx_api.cc
DEPS gflags kernel op optimizer mir_passes utils)
DEPS gflags kernel op optimizer mir_passes utils ${host_kernels})
add_dependencies(opt op_list_h kernel_list_h all_kernel_faked_cc supported_kernel_op_info_h)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
......@@ -326,6 +359,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -347,6 +381,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -360,6 +395,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -373,6 +409,7 @@ if(NOT IOS)
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
......@@ -383,6 +420,7 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......@@ -395,17 +433,20 @@ if(NOT IOS)
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels}
CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels}
FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels}
CUDA_DEPS ${cuda_kernels})
lite_cc_binary(test_transformer SRCS transform_test.cc DEPS paddle_api_full paddle_api_light gflags utils
${ops} ${host_kernels}
ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${rknpu_kernels}
XPU_DEPS ${xpu_kernels}
CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels}
......
......@@ -63,6 +63,7 @@ USE_LITE_OP(swish)
USE_LITE_OP(log)
USE_LITE_OP(exp)
USE_LITE_OP(conv2d_transpose)
USE_LITE_OP(depthwise_conv2d_transpose)
USE_LITE_OP(negative)
USE_LITE_OP(pad2d)
USE_LITE_OP(power)
......
......@@ -13,7 +13,13 @@
// limitations under the License.
#include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#define GLOG_NO_ABBREVIATED_SEVERITIES  // glog severity names conflict with windows.h macros under MSVC
#include <time.h>
#include <algorithm>
#include <cstdio>
......@@ -27,6 +33,9 @@
#include "lite/utils/cp_logging.h"
#include "lite/utils/string.h"
DEFINE_string(optimized_model_path,
"",
"the path of the model that is optimized by opt.");
DEFINE_string(model_dir,
"",
"the path of the model, the model and param files is under "
......@@ -44,7 +53,10 @@ DEFINE_string(input_shape,
"set input shapes according to the model, "
"separated by colon and comma, "
"such as 1,3,244,244");
DEFINE_string(input_img_path, "", "the path of input image");
DEFINE_string(input_img_path,
"",
"the path of input image, if not set "
"input_img_path, the input of model will be 1.0.");
DEFINE_int32(warmup, 0, "warmup times");
DEFINE_int32(repeats, 1, "repeats times");
DEFINE_int32(power_mode,
......@@ -57,16 +69,8 @@ DEFINE_int32(power_mode,
DEFINE_int32(threads, 1, "threads num");
DEFINE_string(result_filename,
"result.txt",
"save benchmark "
"result to the file");
DEFINE_bool(run_model_optimize,
false,
"if set true, apply model_optimize_tool to "
"model and use optimized model to test. ");
DEFINE_bool(is_quantized_model,
false,
"if set true, "
"test the performance of the quantized model. ");
"save the inference time to the file.");
DEFINE_bool(show_output, false, "Whether to show the output in shell.");
namespace paddle {
namespace lite_api {
......@@ -87,10 +91,6 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
std::vector<Place> vaild_places = {
Place{TARGET(kARM), PRECISION(kFloat)},
};
if (FLAGS_is_quantized_model) {
vaild_places.insert(vaild_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
config.set_valid_places(vaild_places);
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -106,15 +106,23 @@ void OutputOptModel(const std::string& save_optimized_model_dir) {
LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
}
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
int64_t num = 1;
for (auto i : shape) {
num *= i;
}
return num;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void Run(const std::vector<int64_t>& input_shape,
const std::string& model_dir,
const std::string& model_path,
const std::string model_name) {
// set config and create predictor
lite_api::MobileConfig config;
config.set_threads(FLAGS_threads);
config.set_power_mode(static_cast<PowerMode>(FLAGS_power_mode));
config.set_model_from_file(model_dir + ".nb");
config.set_model_from_file(model_path);
auto predictor = lite_api::CreatePaddlePredictor(config);
......@@ -122,10 +130,7 @@ void Run(const std::vector<int64_t>& input_shape,
auto input_tensor = predictor->GetInput(0);
input_tensor->Resize(input_shape);
auto input_data = input_tensor->mutable_data<float>();
int input_num = 1;
for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i];
}
int64_t input_num = ShapeProduction(input_shape);
if (FLAGS_input_img_path.empty()) {
for (int i = 0; i < input_num; ++i) {
input_data[i] = 1.f;
......@@ -173,26 +178,78 @@ void Run(const std::vector<int64_t>& input_shape,
ofs << "average = " << std::setw(12) << avg_res;
ofs << std::endl;
ofs.close();
if (FLAGS_show_output) {
auto out_tensor = predictor->GetOutput(0);
auto* out_data = out_tensor->data<float>();
int64_t output_num = ShapeProduction(out_tensor->shape());
float max_value = out_data[0];
int max_index = 0;
for (int i = 0; i < output_num; i++) {
if (max_value < out_data[i]) {
max_value = out_data[i];
max_index = i;
}
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
LOG(INFO) << "output data[0:10]:";
for (int i = 0; i < 10; i++) {
LOG(INFO) << out_data[i];
}
}
}
#endif
} // namespace lite_api
} // namespace paddle
void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (The path of the model that is optimized\n"
" by opt. If the model is optimized, please set the param.) \n"
" type: string \n"
" --model_dir (The path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (The filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (The filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (Set input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (The path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (Arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (Repeats times) type: int32 default: 1 \n"
" --result_filename (Save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (Threads num) type: int32 default: 1 \n"
" --warmup (Warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path. Otherwise, \n"
" set model_dir, model_filename and param_filename according to \n"
" the model. \n";
LOG(INFO) << help_info;
}
int main(int argc, char** argv) {
// Check inputs
gflags::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_model_dir == "" || FLAGS_result_filename == "") {
LOG(INFO) << "please run ./benchmark_bin --help to obtain usage.";
bool is_opt_model = (FLAGS_optimized_model_path != "");
bool is_origin_model = (FLAGS_model_dir != "");
if (!is_origin_model && !is_opt_model) {
LOG(INFO) << "Input error, the model path should not be empty.\n";
print_usage();
exit(0);
}
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
std::string model_name = FLAGS_model_dir.substr(found + 1);
std::string save_optimized_model_dir = FLAGS_model_dir + "_opt2";
// Get input shape
auto get_shape = [](const std::string& str_shape) -> std::vector<int64_t> {
std::vector<int64_t> shape;
std::string tmp_str = str_shape;
......@@ -208,19 +265,31 @@ int main(int argc, char** argv) {
}
return shape;
};
std::vector<int64_t> input_shape = get_shape(FLAGS_input_shape);
// Output optimized model if needed
if (FLAGS_run_model_optimize) {
paddle::lite_api::OutputOptModel(save_optimized_model_dir);
// Get model_name and run_model_path
std::string model_name;
std::string run_model_path;
if (is_origin_model) {
if (FLAGS_model_dir.back() == '/') {
FLAGS_model_dir.pop_back();
}
std::size_t found = FLAGS_model_dir.find_last_of("/");
model_name = FLAGS_model_dir.substr(found + 1);
std::string optimized_model_path = FLAGS_model_dir + "_opt2";
paddle::lite_api::OutputOptModel(optimized_model_path);
run_model_path = optimized_model_path + ".nb";
} else {
size_t found1 = FLAGS_optimized_model_path.find_last_of("/");
size_t found2 = FLAGS_optimized_model_path.find_last_of(".");
size_t len = found2 - found1 - 1;
model_name = FLAGS_optimized_model_path.substr(found1 + 1, len);
run_model_path = FLAGS_optimized_model_path;
}
#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
// Run inference using optimized model
std::string run_model_dir =
FLAGS_run_model_optimize ? save_optimized_model_dir : FLAGS_model_dir;
paddle::lite_api::Run(input_shape, run_model_dir, model_name);
// Run test
paddle::lite_api::Run(input_shape, run_model_path, model_name);
#endif
return 0;
}
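The get_shape lambda above (its body is partly elided by the hunk markers) turns a flag value such as "1,3,224,224" into a vector of dims. A standalone sketch of the same comma-separated parsing, with hypothetical names that are not part of this patch:

#include <cstdint>
#include <string>
#include <vector>

// Hypothetical helper illustrating the parsing done by the get_shape lambda
// in main(); e.g. "1,3,224,224" -> {1, 3, 224, 224}.
std::vector<int64_t> ParseShape(const std::string& str_shape) {
  std::vector<int64_t> shape;
  std::string tmp = str_shape;
  while (!tmp.empty()) {
    size_t comma = tmp.find(',');
    shape.push_back(std::stoll(tmp.substr(0, comma)));
    tmp = (comma == std::string::npos) ? "" : tmp.substr(comma + 1);
  }
  return shape;
}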
......@@ -19,6 +19,7 @@
#include <string>
#include <utility>
#include <vector>
#include "lite/api/paddle_use_passes.h"
#include "lite/utils/io.h"
namespace paddle {
......@@ -291,10 +292,13 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
for (auto &valid_place : valid_places) {
inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
// Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places
const std::vector<std::string> quant_dequant_op = {
"fake_quantize_abs_max",
"fake_quantize_range_abs_max",
......@@ -317,7 +321,8 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
}
}
if (is_quantized_model) {
inner_places.emplace_back(Place{TARGET(kARM), PRECISION(kInt8)});
inner_places.insert(inner_places.begin(),
Place{TARGET(kARM), PRECISION(kInt8)});
}
Program program(desc, scope_, inner_places);
......
......@@ -43,6 +43,7 @@ class LITE_API Predictor {
public:
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
......
......@@ -20,19 +20,35 @@
#include "lite/core/device_info.h"
#include "lite/core/version.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h>
#include "lite/backends/x86/mklml.h"
#endif
namespace paddle {
namespace lite {
void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config_ = config;
auto places = config.valid_places();
std::vector<std::string> passes{};
#ifdef LITE_WITH_CUDA
Env<TARGET(kCUDA)>::Init();
// If kCUDA is included in the valid places, it should be initialized first;
// otherwise this step is skipped.
for (auto &p : places) {
if (p.target == TARGET(kCUDA)) {
Env<TARGET(kCUDA)>::Init();
if (config_.multi_stream()) {
passes = {"multi_stream_analysis_pass"};
VLOG(3) << "add pass: " << passes[0];
}
break;
}
}
#endif
#ifdef LITE_WITH_MLU
Env<TARGET(kMLU)>::Init();
......@@ -43,8 +59,6 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
config.mlu_first_conv_std(),
config.mlu_input_layout());
#endif // LITE_WITH_MLU
auto places = config.valid_places();
std::vector<std::string> passes{};
auto use_layout_preprocess_pass =
config.model_dir().find("OPENCL_PRE_PRECESS");
VLOG(1) << "use_layout_preprocess_pass:" << use_layout_preprocess_pass;
......@@ -56,9 +70,8 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
raw_predictor_.Build(config, places, passes);
mode_ = config.power_mode();
threads_ = config.threads();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
int num_threads = config.x86_math_library_num_threads();
int real_num_threads = num_threads > 1 ? num_threads : 1;
paddle::lite::x86::MKL_Set_Num_Threads(real_num_threads);
......
......@@ -13,13 +13,10 @@
// limitations under the License.
#include "lite/api/light_api.h"
#include <algorithm>
#include <unordered_map>
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#include <algorithm>
namespace paddle {
namespace lite {
......@@ -32,7 +29,10 @@ void LightPredictor::Build(const std::string& lite_model_file,
LoadModelNaiveFromFile(lite_model_file, scope_.get(), &cpp_program_desc_);
}
// For post-training weight quantization, load the int8/16 weights of the
// optimized model and dequantize them to fp32.
DequantizeWeight();
BuildRuntimeProgram(cpp_program_desc_);
PrepareFeedFetch();
}
......@@ -139,7 +139,12 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
// 2. Create Instructs
#ifdef LITE_WITH_OPENCL
using OpenCLContext = Context<TargetType::kOpenCL>;
std::unique_ptr<KernelContext> local_ctx(new KernelContext());
local_ctx->As<OpenCLContext>().InitOnce();
#endif
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
......@@ -155,7 +160,18 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
#ifdef LITE_WITH_OPENCL
if ((*it)->target() == TARGET(kOpenCL)) {
std::unique_ptr<KernelContext> ctx(new KernelContext());
(*local_ctx).As<OpenCLContext>().CopySharedTo(&ctx->As<OpenCLContext>());
(*it)->SetContext(std::move(ctx));
} else {
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
}
#else
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
#endif
insts.emplace_back(op, std::move(*it));
}
......@@ -166,58 +182,76 @@ void LightPredictor::BuildRuntimeProgram(const cpp::ProgramDesc& prog) {
}
void LightPredictor::DequantizeWeight() {
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < h; ++i) { \
for (int64_t j = 0; j < w; ++j) { \
fp_data[i * w + j] = scale_list[i] * int_data[i * w + j]; \
} \
#define PROCESS_CONV2D_DATA() \
for (int64_t i = 0; i < ch; ++i) { \
for (int64_t j = 0; j < offset; ++j) { \
fp_data[i * offset + j] = scale_list[i] * int_data[i * offset + j]; \
} \
}
#define PROCESS_FC_DATA() \
for (int i = 0; i < input_tensor->numel(); i++) { \
*fp_data = scale_list[0] * (*int_data); \
++fp_data; \
++int_data; \
#define PROCESS_FC_DATA() \
for (int64_t i = 0; i < chin; i++) { \
for (int64_t j = 0; j < chout; j++) { \
fp_data[i * chout + j] = scale_list[j] * int_data[i * chout + j]; \
} \
}
auto is_weight_quantized_op = [](const cpp::OpDesc* op_desc) {
bool result = false;
if (op_desc->HasAttr("quantization_type")) {
std::string type = op_desc->GetAttr<std::string>("quantization_type");
result = (type == "post_weight_abs_max") ||
(type == "post_weight_channel_wise_abs_max");
} else {
result = op_desc->HasAttr("quantize_weight_bits");
}
return result;
};
Tensor tmp_tensor;
CHECK(cpp_program_desc_.BlocksSize());
auto* main_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
for (size_t k = 0; k < main_block->OpsSize(); ++k) {
auto* op_desc = main_block->GetOp<cpp::OpDesc>(k);
if (op_desc->HasAttr("quantize_weight_bits")) { // weight quantized op
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t h = input_tensor->dims()[0];
int64_t w = input_tensor->numel() / h;
CHECK_EQ(scale_list.size(), h);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
for (size_t i = 0; i < cpp_program_desc_.BlocksSize(); i++) {
auto* block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(i);
for (size_t k = 0; k < block->OpsSize(); ++k) {
auto* op_desc = block->GetOp<cpp::OpDesc>(k);
if (is_weight_quantized_op(op_desc)) {
auto input_names = op_desc->input_vars();
for (auto& input_name : input_names) {
std::string input_scale_name = input_name + "_quant_scale";
if (op_desc->HasAttr(input_scale_name)) { // the input is quantized
auto input_tensor =
scope_->FindVar(input_name)->GetMutable<lite::Tensor>();
tmp_tensor.CopyDataFrom(*input_tensor);
auto scale_list =
op_desc->GetAttr<std::vector<float>>(input_scale_name);
int quantize_weight_bits =
op_desc->GetAttr<int>("quantize_weight_bits");
CHECK(quantize_weight_bits == 8 || quantize_weight_bits == 16);
float* fp_data = input_tensor->mutable_data<float>();
std::string op_type = op_desc->Type();
if (op_type == "conv2d" || op_type == "depthwise_conv2d") {
int64_t ch = input_tensor->dims()[0];
int64_t offset = input_tensor->numel() / ch;
CHECK_EQ(scale_list.size(), ch);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_CONV2D_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_CONV2D_DATA()
}
} else if (op_type == "fc" || op_type == "mul") {
int64_t chin = input_tensor->dims()[0];
int64_t chout = input_tensor->dims()[1];
CHECK_EQ(scale_list.size(), chout);
if (quantize_weight_bits == 8) {
const int8_t* int_data = tmp_tensor.data<int8_t>();
PROCESS_FC_DATA()
} else {
const int16_t* int_data = tmp_tensor.data<int16_t>();
PROCESS_FC_DATA()
}
}
}
}
......
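The two macros above implement the actual dequantization: conv weights use one scale per output channel, while fc/mul weights use one scale per output column. A minimal standalone sketch of the conv case, with hypothetical names that are not part of this patch:

#include <cstdint>
#include <vector>

// Sketch of per-channel dequantization as done by PROCESS_CONV2D_DATA:
// every int8 weight of output channel `c` maps back to fp32 as
// fp = scale[c] * int_value.
std::vector<float> DequantizeConvWeights(const std::vector<int8_t>& int_data,
                                         const std::vector<float>& scales,
                                         int64_t channels,
                                         int64_t channel_size) {
  std::vector<float> fp_data(int_data.size());
  for (int64_t c = 0; c < channels; ++c) {
    for (int64_t j = 0; j < channel_size; ++j) {
      fp_data[c * channel_size + j] =
          scales[c] * static_cast<float>(int_data[c * channel_size + j]);
    }
  }
  return fp_data;
}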
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "lite/api/paddle_api.h"
namespace paddle {
namespace lite_api {
void RunModel() {
// 1. Set MobileConfig
MobileConfig mobile_config;
// 2. Create PaddlePredictor by MobileConfig
std::shared_ptr<PaddlePredictor> mobile_predictor =
CreatePaddlePredictor<MobileConfig>(mobile_config);
}
} // namespace lite_api
} // namespace paddle
......@@ -23,6 +23,7 @@
#include "kernel_src_map.h" // NOLINT
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/core/op_registry.h"
......@@ -108,6 +109,10 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else {
......@@ -186,6 +191,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kFPGA",
"kNPU",
"kXPU",
"kRKNPU",
"kAny",
"kUnk"};
int maximum_optype_length = 0;
......@@ -250,16 +256,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu)`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n"
" `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n"
" `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu)`"
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`"
" Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl;
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/api/paddle_api.h"
#include "lite/core/context.h"
#include "lite/core/device_info.h"
#include "lite/core/target_wrapper.h"
#include "lite/core/tensor.h"
......@@ -203,6 +204,7 @@ void ConfigBase::set_threads(int threads) {
#endif
}
#ifdef LITE_WITH_MLU
void CxxConfig::set_mlu_core_version(lite_api::MLUCoreVersion core_version) {
mlu_core_version_ = core_version;
}
......@@ -227,12 +229,32 @@ lite_api::MLUCoreVersion CxxConfig::mlu_core_version() const {
int CxxConfig::mlu_core_number() const { return mlu_core_number_; }
DataLayoutType CxxConfig::mlu_input_layout() const { return mlu_input_layout_; }
bool CxxConfig::mlu_use_first_conv() const { return mlu_use_first_conv_; }
std::vector<float> CxxConfig::mlu_first_conv_mean() const {
const std::vector<float> &CxxConfig::mlu_first_conv_mean() const {
return mlu_first_conv_mean_;
}
std::vector<float> CxxConfig::mlu_first_conv_std() const {
const std::vector<float> &CxxConfig::mlu_first_conv_std() const {
return mlu_first_conv_std_;
}
#endif
void CxxConfig::set_xpu_workspace_l3_size_per_thread(int l3_size) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetWorkspaceL3Size(l3_size);
#else
LOG(WARNING) << "The invoking of the function "
"'set_xpu_workspace_l3_size_per_thread' is ignored, please "
"rebuild it with LITE_WITH_XPU=ON.";
#endif
}
void CxxConfig::set_xpu_dev_per_thread(int dev_no) {
#ifdef LITE_WITH_XPU
lite::Context<TargetType::kXPU>::SetDev(dev_no);
#else
LOG(WARNING) << "The invoking of the function 'set_xpu_dev_per_thread' is "
"ignored, please rebuild it with LITE_WITH_XPU=ON.";
#endif
}
// Set model data in combined format: `set_model_from_file` loads the model
// from a file, while `set_model_from_buffer` loads it from a memory buffer.
......
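A minimal usage sketch of the two loading paths described in the comment above; the path and buffer names are illustrative, not part of this patch:

#include <string>
#include "lite/api/paddle_api.h"

void LoadOptimizedModel(const std::string& nb_path,
                        const std::string& nb_bytes /* whole .nb file in memory */) {
  paddle::lite_api::MobileConfig config;
  // Either load the optimized model directly from disk ...
  config.set_model_from_file(nb_path);
  // ... or, alternatively, hand over the same bytes from memory,
  // e.g. when the model ships inside the application binary:
  // config.set_model_from_buffer(nb_bytes);
  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);
  (void)predictor;
}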
......@@ -136,12 +136,17 @@ class LITE_API CxxConfig : public ConfigBase {
#ifdef LITE_WITH_X86
int x86_math_library_math_threads_ = 1;
#endif
#ifdef LITE_WITH_CUDA
bool multi_stream_{false};
#endif
#ifdef LITE_WITH_MLU
lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270};
int mlu_core_number_{1};
DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)};
bool mlu_use_first_conv_{false};
std::vector<float> mlu_first_conv_mean_;
std::vector<float> mlu_first_conv_std_;
#endif
public:
void set_valid_places(const std::vector<Place>& x) { valid_places_ = x; }
......@@ -169,20 +174,41 @@ class LITE_API CxxConfig : public ConfigBase {
return x86_math_library_math_threads_;
}
#endif
#ifdef LITE_WITH_CUDA
void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; }
bool multi_stream() const { return multi_stream_; }
#endif
#ifdef LITE_WITH_MLU
// set MLU core version, which is used when compiling MLU kernels
void set_mlu_core_version(lite_api::MLUCoreVersion core_version);
// set MLU core number, which is used when compiling MLU kernels
void set_mlu_core_number(int core_number);
// set MLU input layout. User can specify layout of input data to be NHWC,
// default is NCHW
void set_mlu_input_layout(DataLayoutType layout);
// whether to use MLU's first conv kernel. First conv is a special kernel
// provided by MLU; its input is uint8, and it also needs two 3-dimensional
// vectors holding the inputs' mean and std values
void set_mlu_use_first_conv(bool use_first_conv);
// set the 3-dimensional mean vector used by MLU's first conv
void set_mlu_first_conv_mean(const std::vector<float>& mean);
// set the 3-dimensional std vector used by MLU's first conv
void set_mlu_first_conv_std(const std::vector<float>& std);
lite_api::MLUCoreVersion mlu_core_version() const;
int mlu_core_number() const;
DataLayoutType mlu_input_layout() const;
bool mlu_use_first_conv() const;
std::vector<float> mlu_first_conv_mean() const;
std::vector<float> mlu_first_conv_std() const;
const std::vector<float>& mlu_first_conv_mean() const;
const std::vector<float>& mlu_first_conv_std() const;
#endif
// XPU only, set the size of the workspace memory from L3 cache for the
// current thread.
void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00);
// XPU only, specify the target device ID for the current thread.
void set_xpu_dev_per_thread(int dev_no = 0);
};
/// MobileConfig is the config for the light weight predictor, it will skip
......
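The MLU and XPU options declared above combine roughly as below; a sketch with illustrative values, assuming a library built with the matching LITE_WITH_MLU / LITE_WITH_XPU switches:

#include <vector>
#include "lite/api/paddle_api.h"

void ConfigureDeviceOptions(paddle::lite_api::CxxConfig* config) {
#ifdef LITE_WITH_MLU
  // MLU-only knobs (compiled in only when LITE_WITH_MLU is ON).
  config->set_mlu_core_version(paddle::lite_api::MLUCoreVersion::MLU_270);
  config->set_mlu_core_number(4);                   // illustrative core count
  config->set_mlu_input_layout(DATALAYOUT(kNHWC));  // default is kNCHW
  config->set_mlu_use_first_conv(true);
  config->set_mlu_first_conv_mean({124.f, 117.f, 104.f});  // per-channel mean
  config->set_mlu_first_conv_std({59.f, 57.f, 57.f});      // per-channel std
#endif
  // XPU-only knobs; on non-XPU builds they only emit a warning.
  config->set_xpu_workspace_l3_size_per_thread();  // keep the default size
  config->set_xpu_dev_per_thread(0);  // bind this thread to device 0
}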
......@@ -18,20 +18,21 @@
*/
#pragma once
#define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
touch_op_##op_type__();
// some platform-independent definitions
#include "lite/utils/macros.h"
#define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
__attribute__((unused)) = \
touch_##op_type__##target__##precision__##layout__##alias__();
UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
#define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ __attribute__((unused)) = \
#define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ UNUSED = \
mir_pass_registry##name__##_fake();
#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
......@@ -72,7 +72,8 @@ const std::string& TargetToStr(TargetType target) {
"npu",
"xpu",
"bm",
"mlu"};
"mlu",
"rknpu"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......@@ -113,7 +114,8 @@ const std::string& TargetRepr(TargetType target) {
"kNPU",
"kXPU",
"kMLU",
"kBM"};
"kBM",
"kRKNPU"};
auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x];
......
......@@ -54,8 +54,9 @@ enum class TargetType : int {
kXPU = 9,
kBM = 10,
kMLU = 11,
kRKNPU = 12,
kAny = 6, // any target
NUM = 12, // number of fields.
NUM = 13, // number of fields.
};
enum class PrecisionType : int {
kUnk = 0,
......@@ -101,7 +102,10 @@ enum class ActivationType : int {
kTanh = 6,
kSwish = 7,
kExp = 8,
NUM = 9,
kAbs = 9,
kHardSwish = 10,
kReciprocal = 11,
NUM = 12,
};
static size_t PrecisionTypeLength(PrecisionType type) {
......
......@@ -42,11 +42,13 @@ USE_MIR_PASS(type_precision_cast_pass);
USE_MIR_PASS(type_layout_cast_pass);
USE_MIR_PASS(type_layout_cast_preprocess_pass);
USE_MIR_PASS(memory_optimize_pass);
USE_MIR_PASS(multi_stream_analysis_pass);
USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
USE_MIR_PASS(npu_subgraph_pass);
USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(subgraph_cast_display_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
......@@ -17,8 +17,12 @@ execute_process(
OUTPUT_VARIABLE PADDLE_LITE_COMMIT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
if(APPLE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup_mac.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
else()
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
${CMAKE_CURRENT_BINARY_DIR}/setup.py)
endif()
add_subdirectory(pybind)
#add_subdirectory(interface)
......@@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
if os.name =='nt':
current_path = os.path.abspath(os.path.dirname(__file__))
third_lib_path = current_path + os.sep + 'libs'
os.environ['path'] = third_lib_path+ ';' + os.environ['path']
sys.path.insert(0, third_lib_path)
......@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif()
......@@ -34,20 +34,27 @@ else:
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
if os.name != 'nt':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
else:
PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll']
shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
if os.name != 'nt':
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
......@@ -61,6 +68,14 @@ PACKAGE_DIR = {
'paddlelite': LITE_PATH
}
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in PACKAGE_DIR.items():
fix_package_dir[k] = v.replace('/', '\\')
PACKAGE_DIR = fix_package_dir
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# module of pack whl installer for Paddle-lite
import shutil
import os
from setuptools import setup, Distribution
class BinaryDistribution(Distribution):
'binary distribution'
def has_ext_modules(foo):
return True
# get paddle-lite version, if it's not based on a release tag, we use commit id instead
PADDLELITE_COMMITE = "@PADDLE_LITE_COMMIT@"
PADDLELITE_TAG = "@PADDLE_LITE_TAG@"
if PADDLELITE_TAG == "":
PADDLELITE_VERSION = PADDLELITE_COMMITE
else:
PADDLELITE_VERSION = PADDLELITE_TAG
# core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']}
# put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml.dylib', 'libiomp5.dylib']
# link lite.so to paddlelite.libs
COMMAND = "install_name_tool -id \"@loader_path/../libs/\" ${PADDLE_BINARY_DIR}\
/inference_lite_lib/python/install/lite/lite.so"
if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'):
os.remove(LIB_PATH+'/__init__.py')
# set dir path of each package
PACKAGE_DIR = {
# The paddle.fluid.proto will be generated while compiling.
# So that package points to other directory.
'paddlelite.libs': LIB_PATH,
'paddlelite': LITE_PATH
}
setup(
name='paddlelite',
version=PADDLELITE_VERSION,
description='Paddle-Lite Library',
packages=['paddlelite', 'paddlelite.libs'],
package_dir=PACKAGE_DIR,
package_data=PACKAGE_DATA,
distclass=BinaryDistribution
)
......@@ -36,7 +36,8 @@ void TestModel(const std::vector<Place>& valid_places) {
predictor.Build(FLAGS_model_dir, "", "", valid_places, passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
......@@ -67,15 +68,13 @@ void TestModel(const std::vector<Place>& valid_places) {
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto* out = predictor.GetOutput(0);
ASSERT_EQ(out->dims().size(), 2);
ASSERT_EQ(out->dims()[0], 1);
ASSERT_EQ(out->dims()[1], 1000);
auto* out_data = out->data<float>();
auto out = predictor.GetOutputs();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out->numel(); i++) {
fprintf(fp, "%f\n", out_data[i]);
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
}
fclose(fp);
}
......
......@@ -15,7 +15,12 @@
#pragma once
#include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#include <time.h>
#include <cmath>
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
#include "lite/api/test_helper.h"
#include "lite/core/op_registry.h"
DEFINE_string(input_img_txt_path,
"",
"if set input_img_txt_path, read the img filename as input.");
namespace paddle {
namespace lite {
void TestModel(const std::vector<Place>& valid_places) {
lite::Predictor predictor;
std::vector<std::string> passes;
predictor.Build(FLAGS_model_dir,
FLAGS_model_dir + "/model",
FLAGS_model_dir + "/params",
valid_places,
passes);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(
std::vector<DDim::value_type>({1, 3, FLAGS_im_height, FLAGS_im_width})));
auto* data = input_tensor->mutable_data<float>();
auto item_size = input_tensor->dims().production();
if (FLAGS_input_img_txt_path.empty()) {
for (int i = 0; i < item_size; i++) {
data[i] = 1;
}
} else {
std::fstream fs(FLAGS_input_img_txt_path, std::ios::in);
if (!fs.is_open()) {
LOG(FATAL) << "open input_img_txt error.";
}
for (int i = 0; i < item_size; i++) {
fs >> data[i];
}
}
auto* image_tensor = predictor.GetInput(1);
image_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 2})));
data = image_tensor->mutable_data<float>();
data[0] = FLAGS_im_height;
data[1] = FLAGS_im_width;
for (int i = 0; i < FLAGS_warmup; ++i) {
predictor.Run();
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeats; ++i) {
predictor.Run();
}
LOG(INFO) << "================== Speed Report ===================";
LOG(INFO) << "Model: " << FLAGS_model_dir << ", threads num " << FLAGS_threads
<< ", warmup: " << FLAGS_warmup << ", repeats: " << FLAGS_repeats
<< ", spend " << (GetCurrentUS() - start) / FLAGS_repeats / 1000.0
<< " ms in average.";
auto out = predictor.GetOutputs();
FILE* fp = fopen("result.txt", "wb");
for (int i = 0; i < out.size(); i++) {
auto* out_data = out[i]->data<float>();
for (int j = 0; j < out[i]->numel(); j++) {
fprintf(fp, "%f\n", out_data[j]);
}
}
fclose(fp);
}
TEST(Yolov3, test_bm) {
std::vector<Place> valid_places({Place{TARGET(kBM), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
TestModel(valid_places);
}
} // namespace lite
} // namespace paddle
......@@ -13,7 +13,9 @@
// limitations under the License.
#include <gflags/gflags.h>
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#endif
#include <string>
#include <vector>
#include "lite/api/cxx_api.h"
......
......@@ -8,3 +8,4 @@ add_subdirectory(npu)
add_subdirectory(xpu)
add_subdirectory(mlu)
add_subdirectory(bm)
add_subdirectory(rknpu)
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/backends/arm/math/activation.h"
#include <algorithm>
#include <string>
#include "lite/backends/arm/math/funcs.h"
......@@ -711,6 +712,47 @@ void act_square<float>(const float* din, float* dout, int size, int threads) {
}
}
template <>
void act_hard_swish<float>(const float* din,
float* dout,
int size,
float threshold,
float scale,
float offset,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = std::min(std::max(0.f, ptr_in[0] + offset), threshold) *
ptr_in[0] / scale;
ptr_in++;
ptr_out++;
}
}
template <>
void act_reciprocal<float>(const float* din,
float* dout,
int size,
int threads) {
const float* ptr_in = din;
float* ptr_out = dout;
for (int i = 0; i < size; ++i) {
ptr_out[0] = 1.0 / ptr_in[0];
ptr_in++;
ptr_out++;
}
}
template <>
void act_abs<float>(const float* din, float* dout, int size, int threads) {
for (int i = 0; i < size; ++i) {
dout[0] = (din[0] > 0 ? din[0] : -din[0]);
din++;
dout++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
......
......@@ -72,6 +72,20 @@ void act_rsqrt(const T* din, T* dout, int size, int threads);
template <typename T>
void act_square(const T* din, T* dout, int size, int threads);
template <typename T>
void act_hard_swish(const T* din,
T* dout,
int size,
float threshold,
float scale,
float offset,
int threads);
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T>
void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
......
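For reference, the hard_swish applied by act_hard_swish above reduces, per element, to y = x * clamp(x + offset, 0, threshold) / scale. A scalar sketch; the 6/6/3 parameters below are the commonly used defaults, not values taken from this patch:

#include <algorithm>

// Scalar reference: y = min(max(0, x + offset), threshold) * x / scale.
// With threshold = 6, scale = 6, offset = 3:
// hard_swish_ref(1.f, 6.f, 6.f, 3.f) = min(max(0, 4), 6) * 1 / 6 = 2/3.
inline float hard_swish_ref(float x, float threshold, float scale, float offset) {
  return std::min(std::max(0.f, x + offset), threshold) * x / scale;
}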
......@@ -16,46 +16,3 @@
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
const int axis,
lite::Tensor *output) {
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
auto dim_0 = input[0]->dims();
size_t num = input.size();
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
float *dst_ptr = output->mutable_data<float>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
const float *src_ptr = input[n]->data<float>();
int64_t in_concat_axis = dims[axis];
float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -25,9 +25,39 @@ namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
template <typename T>
void concat_func(const std::vector<lite::Tensor*>& input,
const int axis,
lite::Tensor *output);
lite::Tensor* output) {
size_t num = input.size();
auto dim_0 = input[0]->dims();
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
auto* dst_ptr = output->mutable_data<T>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
auto* src_ptr = input[n]->data<T>();
int64_t in_concat_axis = dims[axis];
auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
......
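A rough usage sketch for the templated concat_func above; the header path, tensor names, and shapes are illustrative, and the caller is assumed to have resized the output beforehand:

#include <vector>
#include "lite/backends/arm/math/concat.h"  // assumed header location

void ConcatTwoTensors(paddle::lite::Tensor* a,
                      paddle::lite::Tensor* b,
                      paddle::lite::Tensor* out) {
  // e.g. a: {2, 3, 4}, b: {2, 5, 4} -> out already resized to {2, 8, 4}
  std::vector<paddle::lite::Tensor*> inputs{a, b};
  paddle::lite::arm::math::concat_func<float>(inputs, /*axis=*/1, out);
}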
......@@ -302,10 +302,10 @@ void elementwise_add_grad_broadcast<float>(const float* dout_grad,
int pre,
int n,
int post) {
if (x_grad) {
if (x_grad != nullptr) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
if (y_grad != nullptr) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
......@@ -582,10 +582,10 @@ void elementwise_sub_grad<float>(const float* dout_grad,
float* x_grad,
float* y_grad,
int num) {
if (x_grad) {
if (x_grad != nullptr) {
elementwise_add_grad(dout_grad, x_grad, num);
}
if (y_grad) {
if (y_grad != nullptr) {
int cnt = num >> 4;
int remain = num & 0x0f;
float32x4_t minus = vdupq_n_f32(-1);
......@@ -624,10 +624,10 @@ void elementwise_sub_grad_broadcast<float>(const float* dout_grad,
int pre,
int n,
int post) {
if (x_grad) {
if (x_grad != nullptr) {
elementwise_add_grad(dout_grad, x_grad, pre * n * post);
}
if (y_grad) {
if (y_grad != nullptr) {
memset(y_grad, 0, n * sizeof(float));
#pragma omp parallel for
for (int i = 0; i < pre; ++i) {
......
......@@ -198,6 +198,23 @@ void reduce_mean_hw<float>(const float* src,
reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
}
template <>
void mean_grad<float>(const float* out_grad, float* in_grad, int size) {
float grad = out_grad[0] / size;
float32x4_t grad_v = vdupq_n_f32(grad);
int loop = size >> 2;
int remain = size & 3;
#pragma omp parallel for
for (int i = 0; i < loop; ++i) {
vst1q_f32(in_grad, grad_v);
in_grad += 4;
}
for (int i = 0; i < remain; ++i) {
in_grad[i] = grad;
}
}
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -83,6 +83,9 @@ void reduce_mean_all(const T* src,
int height_in,
int width_in);
template <typename T>
void mean_grad(const T* out_grad, T* in_grad, int size);
} // namespace math
} // namespace arm
} // namespace lite
......
......@@ -5,5 +5,7 @@ get_property(cuda_deps GLOBAL PROPERTY CUDA_MODULES)
nv_library(target_wrapper_cuda SRCS target_wrapper.cc DEPS ${cuda_deps})
nv_library(cuda_blas SRCS blas.cc DEPS ${cuda_deps})
lite_cc_library(cuda_context SRCS context.cc DEPS device_info)
add_subdirectory(math)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/context.h"
namespace paddle {
namespace lite {} // namespace lite
} // namespace paddle
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/backends/cuda/blas.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/backends/cuda/target_wrapper.h"
#include "lite/core/device_info.h"
namespace paddle {
namespace lite {
template <TargetType Type>
class Context;
using CUDAContext = Context<TargetType::kCUDA>;
// Only works with CUDA kernels.
template <>
class Context<TargetType::kCUDA> {
public:
typename Env<TargetType::kCUDA>::Devs& devs =
Env<TargetType::kCUDA>::Global();
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {
if (devs.size() > 0) {
cublas_fp32_ = std::make_shared<lite::cuda::Blas<float>>();
} else {
LOG(INFO) << "No cuda device(s) found, CUDAContext init failed.";
}
}
void Init(int dev_id, int exec_stream_id = 0, int io_stream_id = 0) {
CHECK_GT(devs.size(), 0UL)
<< "Env is not initialized or current target is not exit!";
if (dev_id >= static_cast<int>(devs.size())) {
LOG(WARNING) << "device index exceeds the number of devices, set to "
"default device(0)!";
device_id_ = 0;
} else {
device_id_ = dev_id;
}
if (io_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "data stream index exceeds the maximum stream number, "
"set to default stream(0)!";
io_stream_id = 0;
}
if (exec_stream_id >= devs[dev_id].max_stream()) {
LOG(WARNING) << "exec stream index exceeds the maximum stream number, "
"set to default stream(0)!";
exec_stream_id = 0;
}
exec_stream_ = devs[dev_id].exec_streams()[exec_stream_id];
io_stream_ = devs[dev_id].io_streams()[io_stream_id];
exec_stream_id_ = exec_stream_id;
io_stream_id_ = io_stream_id;
need_sync_ = false;
}
void CopySharedTo(CUDAContext* ctx) {
CHECK(ctx);
CHECK(cublas_fp32_) << "cublas_fp32 should be set first";
ctx->cublas_fp32_ = cublas_fp32_;
}
const cudaStream_t& exec_stream() const { return exec_stream_; }
void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
const cudaStream_t& io_stream() const { return io_stream_; }
void SetIoStream(cudaStream_t stream) { io_stream_ = stream; }
std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; }
void SetCuBlasFP32(std::shared_ptr<cuda::Blas<float>> cublas_fp32) {
cublas_fp32_ = cublas_fp32;
}
const std::vector<cudaEvent_t>& input_events() { return input_events_; }
void SetInputEvents(const std::vector<cudaEvent_t>& input_events) {
input_events_.clear();
input_events_.assign(input_events.begin(), input_events.end());
}
const std::vector<cudaEvent_t>& output_events() { return output_events_; }
void SetOutputEvents(const std::vector<cudaEvent_t>& output_events) {
output_events_.clear();
output_events_.assign(output_events.begin(), output_events.end());
}
std::vector<cudaStream_t> all_exec_streams() {
int dev_id = TargetWrapper<TargetType::kCUDA>::GetCurDevice();
return devs[dev_id].exec_streams();
}
void SetSyncStreams(const std::vector<int>& nums) {
sync_streams_.clear();
std::vector<cudaStream_t> exec_streams = all_exec_streams();
for (size_t i = 0; i < nums.size(); ++i) {
CHECK(nums[i] >= 0 && nums[i] < static_cast<int>(exec_streams.size()))
<< "streams id is not valid";
sync_streams_.push_back(exec_streams[nums[i]]);
}
InitSyncEvents(nums.size());
}
void InitSyncEvents(const int num) {
sync_events_.clear();
for (int i = 0; i < num; ++i) {
cudaEvent_t eve;
TargetWrapperCuda::CreateEventWithFlags(&eve);
sync_events_.push_back(eve);
}
}
void SetNeedSync(bool sync) { need_sync_ = sync; }
bool need_sync() const { return need_sync_; }
void Sync() {
CHECK_EQ(sync_streams_.size(), sync_events_.size());
for (size_t i = 0; i < sync_events_.size(); ++i) {
TargetWrapperCuda::RecordEvent(sync_events_[i], sync_streams_[i]);
TargetWrapperCuda::StreamSync(exec_stream_, sync_events_[i]);
}
}
std::string name() const { return "CUDAContext"; }
CUDAContext& operator=(const CUDAContext& context) {
this->Init(
context.device_id_, context.exec_stream_id_, context.io_stream_id_);
cublas_fp32_ = const_cast<CUDAContext&>(context).cublas_fp32();
return *this;
}
private:
int device_id_;
// overall information
int exec_stream_id_;
int io_stream_id_;
cudaStream_t exec_stream_;
cudaStream_t io_stream_;
// not thread-safe, should allocate for each thread.
std::shared_ptr<cuda::Blas<float>> cublas_fp32_;
// kernel information
std::vector<cudaEvent_t> input_events_;
std::vector<cudaEvent_t> output_events_;
// multi stream sync.
std::vector<cudaStream_t> sync_streams_;
std::vector<cudaEvent_t> sync_events_;
bool need_sync_;
};
} // namespace lite
} // namespace paddle
......@@ -29,6 +29,7 @@ enum class BinaryOperation {
kADD = 0,
kMUL = 1,
kDIV = 2,
kSUB = 3,
};
template <typename T>
......@@ -41,6 +42,7 @@ __device__ __forceinline__ float binary_calc(float x,
if (type == BinaryOperation::kADD) return x + y;
if (type == BinaryOperation::kMUL) return x * y;
if (type == BinaryOperation::kDIV) return x / y;
if (type == BinaryOperation::kSUB) return x - y;
}
template <typename T>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -54,19 +51,20 @@ cl::Program &CLContext::GetProgram(const std::string &file_name,
void CLContext::AddKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options) {
const std::string &options,
const std::string &time_stamp) {
cl_int status{CL_SUCCESS};
VLOG(3) << " --- to get program " << file_name << " --- ";
auto program = GetProgram(file_name, options);
VLOG(3) << " --- end get program --- ";
VLOG(3) << " --- to create kernel: " << kernel_name << " --- ";
std::unique_ptr<cl::Kernel> kernel(
std::shared_ptr<cl::Kernel> kernel(
new cl::Kernel(program, kernel_name.c_str(), &status));
CL_CHECK_FATAL(status);
VLOG(3) << " --- end create kernel --- ";
kernels_.emplace_back(std::move(kernel));
STL::stringstream kernel_key;
kernel_key << kernel_name << options;
kernel_key << kernel_name << options << time_stamp;
kernel_offset_[kernel_key.str()] = kernels_.size() - 1;
}
......@@ -121,14 +119,53 @@ cl::NDRange CLContext::DefaultWorkSize(const CLImage &image) {
}
}
cl::NDRange CLContext::LocalWorkSizeTurn(cl::NDRange global_work_size,
size_t max_work_size,
int divisor) {
int preferred_lws = 0;
#if 1
auto gws0 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws2 = global_work_size[2];
#else
auto gws2 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws0 = global_work_size[2];
#endif
if (divisor > 1) {
max_work_size /= divisor;
}
if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws;
}
while (gws1 > max_work_size && max_work_size > 0) {
gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
}
while (gws2 * gws1 > max_work_size && max_work_size > 0) {
gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
}
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
}
#if 1
return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(gws1),
static_cast<size_t>(gws2)};
#else
return cl::NDRange{static_cast<size_t>(gws2),
static_cast<size_t>(gws1),
static_cast<size_t>(gws0)};
#endif
}
cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
size_t max_work_size) {
int preferred_lws = 0;
int divisor = 2;
auto tmp0 = global_work_size[0];
auto tmp1 = global_work_size[1];
auto tmp2 = global_work_size[2];
auto gws0 = global_work_size[0];
auto gws1 = global_work_size[1];
auto gws2 = global_work_size[2];
if (divisor > 1) {
max_work_size /= divisor;
......@@ -136,18 +173,18 @@ cl::NDRange CLContext::LocalWorkSize(cl::NDRange global_work_size,
if (preferred_lws > 0 && preferred_lws <= max_work_size) {
max_work_size = preferred_lws;
}
while (tmp1 > max_work_size && max_work_size > 0) {
tmp1 = tmp1 % 2 == 0 ? tmp1 / 2 : 1;
while (gws1 > max_work_size && max_work_size > 0) {
gws1 = gws1 % 2 == 0 ? gws1 / 2 : 1;
}
while (tmp2 * tmp1 > max_work_size && max_work_size > 0) {
tmp2 = tmp2 % 2 == 0 ? tmp2 / 2 : 1;
while (gws2 * gws1 > max_work_size && max_work_size > 0) {
gws2 = gws2 % 2 == 0 ? gws2 / 2 : 1;
}
while (tmp0 * tmp1 * tmp2 > max_work_size && max_work_size > 0) {
tmp0 = tmp0 % 2 == 0 ? tmp0 / 2 : 1;
while (gws0 * gws1 * gws2 > max_work_size && max_work_size > 0) {
gws0 = gws0 % 2 == 0 ? gws0 / 2 : 1;
}
return cl::NDRange{static_cast<size_t>(tmp0),
static_cast<size_t>(tmp1),
static_cast<size_t>(tmp2)};
return cl::NDRange{static_cast<size_t>(gws0),
static_cast<size_t>(gws1),
static_cast<size_t>(gws2)};
}
} // namespace lite
......
......@@ -27,6 +27,21 @@ namespace lite {
class CLContext {
public:
~CLContext() {
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset();
}
kernels_.clear();
kernel_offset_.clear();
for (auto &p : programs_) {
// Note(ysh329): Don't need `clReleaseProgram`
p.second.reset();
}
programs_.clear();
LOG(INFO) << "release cl::Program, cl::Kernel finished.";
}
cl::CommandQueue &GetCommandQueue();
cl::Context &GetContext();
......@@ -36,7 +51,8 @@ class CLContext {
void AddKernel(const std::string &kernel_name,
const std::string &file_name,
const std::string &options = "");
const std::string &options = "",
const std::string &time_stamp = "");
cl::Kernel &GetKernel(const int index);
......@@ -46,9 +62,15 @@ class CLContext {
cl::NDRange LocalWorkSize(cl::NDRange global_work_size, size_t max_work_size);
cl::NDRange LocalWorkSizeTurn(cl::NDRange global_work_size,
size_t max_work_size,
int divisor = 2);
// cl::NDRange LocalWorkSizeConv1x1(cl::NDRange global_work_size,
// size_t max_work_size);
private:
std::unordered_map<std::string, std::unique_ptr<cl::Program>> programs_;
std::vector<std::unique_ptr<cl::Kernel>> kernels_;
std::vector<std::shared_ptr<cl::Kernel>> kernels_;
std::map<std::string, int> kernel_offset_;
};
......
......@@ -55,17 +55,20 @@ __kernel void relu6(__read_only image2d_t input,
__kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output,
__private const float threshold,
__private const float scale) {
__private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = 1 / (1 + exp(-in));
CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y)));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z)));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w)));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cl_common.h>
__kernel void decode_center_size(__read_only image2d_t prior_box_image,
__read_only image2d_t prior_box_var_image,
__read_only image2d_t target_box_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_H){
const int out_c = get_global_id(0);
const int out_nh = get_global_id(1);
const int out_h = out_nh % out_H;
const int out_n = 1;
const int prior_box_n = 1;
const int prior_box_c = 0;
const int prior_box_h = out_h;
const int prior_box_var_n = 1;
const int prior_box_var_c = 0;
const int prior_box_var_h = out_h;
const int target_box_n = 1;
const int target_box_c = out_c;
const int target_box_h = out_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 prior_box_pos;
int2 prior_box_var_pos;
int2 target_box_pos;
int2 output_pos;
prior_box_pos.x = prior_box_c * 4;
prior_box_pos.y = prior_box_n * prior_box_h;
prior_box_var_pos.x = prior_box_var_c * 4;
prior_box_var_pos.y = prior_box_var_n * prior_box_var_h;
target_box_pos.x = target_box_c * 4;
target_box_pos.y = target_box_n * target_box_h;
output_pos.x = out_c * 4;
output_pos.y = out_n * out_h;
CL_DTYPE4 prior_box_input[4];
CL_DTYPE4 prior_box_var_input[4];
CL_DTYPE4 target_box_input[4];
prior_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 0, prior_box_pos.y));
prior_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 1, prior_box_pos.y));
prior_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 2, prior_box_pos.y));
prior_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_image, sampler,
(int2)(prior_box_pos.x + 3, prior_box_pos.y));
prior_box_var_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 0, prior_box_var_pos.y));
prior_box_var_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 1, prior_box_var_pos.y));
prior_box_var_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 2, prior_box_var_pos.y));
prior_box_var_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, prior_box_var_image, sampler,
(int2)(prior_box_var_pos.x + 3, prior_box_var_pos.y));
target_box_input[0] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 0,target_box_pos.y));
target_box_input[1] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 1, target_box_pos.y));
target_box_input[2] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 2, target_box_pos.y));
target_box_input[3] = READ_IMG_TYPE(CL_DTYPE_CHAR, target_box_image, sampler,
(int2)(target_box_pos.x + 3, target_box_pos.y));
CL_DTYPE prior_box_width = prior_box_input[2].x - prior_box_input[0].x;
CL_DTYPE prior_box_height = prior_box_input[3].x - prior_box_input[1].x;
CL_DTYPE prior_box_center_x = (prior_box_input[2].x + prior_box_input[0].x)/(CL_DTYPE)2;
CL_DTYPE prior_box_center_y = (prior_box_input[3].x + prior_box_input[1].x)/(CL_DTYPE)2;
CL_DTYPE4 target_box_center_x;
CL_DTYPE4 target_box_center_y;
CL_DTYPE4 target_box_width;
CL_DTYPE4 target_box_height;
CL_DTYPE4 output[4];
output[0] = 0.0f;
output[1] = 0.0f;
output[2] = 0.0f;
output[3] = 0.0f;
target_box_center_x.x = prior_box_var_input[0].x * target_box_input[0].x * prior_box_width + prior_box_center_x;
target_box_center_y.x = prior_box_var_input[1].x * target_box_input[1].x * prior_box_height + prior_box_center_y;
target_box_width.x = exp(prior_box_var_input[2].x * target_box_input[2].x) * prior_box_width;
target_box_height.x = exp(prior_box_var_input[3].x * target_box_input[3].x) * prior_box_height;
output[0].x = target_box_center_x.x - target_box_width.x/(half)2;
output[1].x = target_box_center_y.x - target_box_height.x/(half)2;
output[2].x = target_box_center_x.x + target_box_width.x/(half)2;
output[3].x = target_box_center_y.x + target_box_height.x/(half)2;
if(out_C - out_c * 4 >= 2){
target_box_center_x.y = prior_box_var_input[0].x * target_box_input[0].y * prior_box_width + prior_box_center_x;
target_box_center_y.y = prior_box_var_input[1].x * target_box_input[1].y * prior_box_height + prior_box_center_y;
target_box_width.y = exp(prior_box_var_input[2].x * target_box_input[2].y) * prior_box_width;
target_box_height.y = exp(prior_box_var_input[3].x * target_box_input[3].y) * prior_box_height;
output[0].y = target_box_center_x.y - target_box_width.y/(CL_DTYPE)2;
output[1].y = target_box_center_y.y - target_box_height.y/(CL_DTYPE)2;
output[2].y = target_box_center_x.y + target_box_width.y/(CL_DTYPE)2;
output[3].y = target_box_center_y.y + target_box_height.y/(CL_DTYPE)2;
}
if(out_C - out_c * 4 >= 3){
target_box_center_x.z = prior_box_var_input[0].x * target_box_input[0].z * prior_box_width + prior_box_center_x;
target_box_center_y.z = prior_box_var_input[1].x * target_box_input[1].z * prior_box_height + prior_box_center_y;
target_box_width.z = exp(prior_box_var_input[2].x * target_box_input[2].z) * prior_box_width;
target_box_height.z = exp(prior_box_var_input[3].x * target_box_input[3].z) * prior_box_height;
output[0].z = target_box_center_x.z - target_box_width.z/(CL_DTYPE)2;
output[1].z = target_box_center_y.z - target_box_height.z/(CL_DTYPE)2;
output[2].z = target_box_center_x.z + target_box_width.z/(CL_DTYPE)2;
output[3].z = target_box_center_y.z + target_box_height.z/(CL_DTYPE)2;
}
if(out_C - out_c * 4 >= 4){
target_box_center_x.w = prior_box_var_input[0].x * target_box_input[0].w * prior_box_width + prior_box_center_x;
target_box_center_y.w = prior_box_var_input[1].x * target_box_input[1].w * prior_box_height + prior_box_center_y;
target_box_width.w = exp(prior_box_var_input[2].x * target_box_input[2].w) * prior_box_width;
target_box_height.w = exp(prior_box_var_input[3].x * target_box_input[3].w) * prior_box_height;
output[0].w = target_box_center_x.w - target_box_width.w/(CL_DTYPE)2;
output[1].w = target_box_center_y.w - target_box_height.w/(CL_DTYPE)2;
output[2].w = target_box_center_x.w + target_box_width.w/(CL_DTYPE)2;
output[3].w = target_box_center_y.w + target_box_height.w/(CL_DTYPE)2;
}
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 0, output_pos.y), output[0]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 1, output_pos.y), output[1]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 2, output_pos.y), output[2]);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output_image, (int2)(output_pos.x + 3, output_pos.y), output[3]);
}
......@@ -30,6 +30,143 @@ __kernel void elementwise_mul(__global image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul(__global image2d_t input,
__global image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x / w;
coords_bias.y = 0;
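// The bias image holds one texel per channel block at row 0; x / w selects the
// channel block this output column belongs to.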
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
// e.g. bias shape 1 x 1 x 1 x 72:
// at run time the bias image holds 72 texels of the form [value, 0, 0, 0],
// so four consecutive .x lanes are gathered below to rebuild one CL_DTYPE4.
__kernel void channel_mul_d2(__global image2d_t input,
__global image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias0;
int2 coords_bias1;
int2 coords_bias2;
int2 coords_bias3;
/* if (x == 0 && y == 0) {
CL_DTYPE4 b = (CL_DTYPE4){0, 0, 0, 0};
#define PPI(j, k) \
b = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, (int2){j, k}); \
printf("bias(%d,%d)={ %f , %f , %f , %f }\n ", j, k, convert_float(b.x), \
convert_float(b.y), convert_float(b.z), convert_float(b.w));
for (int i = 0; i < 73; ++i) {
PPI(i, 0);
}
#undef PPI
}*/
coords_bias0.x = x / w * 4;
coords_bias0.y = 0;
coords_bias1.x = x / w * 4 + 1;
coords_bias1.y = 0;
coords_bias2.x = x / w * 4 + 2;
coords_bias2.y = 0;
coords_bias3.x = x / w * 4 + 3;
coords_bias3.y = 0;
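// Read four consecutive bias texels; their .x lanes are combined into one
// CL_DTYPE4 below.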
CL_DTYPE4 biase0 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias0);
CL_DTYPE4 biase1 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias1);
CL_DTYPE4 biase2 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias2);
CL_DTYPE4 biase3 = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias3);
/* if (x == 0 && y == 0) {
printf("bias0={ %f , %f , %f , %f }\n ",
convert_float(biase0.x), convert_float(biase0.y),
convert_float(biase0.z), convert_float(biase0.w));
printf("bias1={ %f , %f , %f , %f }\n ",
convert_float(biase1.x), convert_float(biase1.y),
convert_float(biase1.z), convert_float(biase1.w));
printf("bias2={ %f , %f , %f , %f }\n ",
convert_float(biase2.x), convert_float(biase2.y),
convert_float(biase2.z), convert_float(biase2.w));
printf("bias3={ %f , %f , %f , %f }\n ",
convert_float(biase3.x), convert_float(biase3.y),
convert_float(biase3.z), convert_float(biase3.w));
}*/
CL_DTYPE4 biase = {biase0.x, biase1.x, biase2.x, biase3.x};
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 output = mad(in, biase, (CL_DTYPE4)(0.0f));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
// bias shape: C x 1 x 1
__kernel void channel_mul_d3(__global image2d_t input,
__global image2d_t bias,
__write_only image2d_t outputImage,
int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x / w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d4(__global image2d_t input,
__global image2d_t bias,
__write_only image2d_t outputImage, int w) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
int2 coords_bias;
coords_bias.x = x / w;
coords_bias.y = 0;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords_bias);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
#if 0 // TODO(ysh329): comment code below
__kernel void elementwise_mul(__global image2d_t input,
__global image2d_t bias,
__write_only image2d_t outputImage) {
int x = get_global_id(0);
int y = get_global_id(1);
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coords;
coords.x = x;
coords.y = y;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, coords);
CL_DTYPE4 biase = READ_IMG_TYPE(CL_DTYPE_CHAR, bias, sampler, coords);
CL_DTYPE4 output = in * biase;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
__kernel void channel_mul_d1(__read_only image2d_t input,
__read_only image2d_t bias,
......@@ -184,4 +321,4 @@ __kernel void channel_mul_d4(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, outputImage, coords, output);
}
#endif
......@@ -14,14 +14,127 @@ limitations under the License. */
#include <cl_common.h>
// ONNX/PyTorch-style instance norm (by lijian)
__kernel void instance_norm_onnx(__private const int in_width,
__private const int in_height,
__private const int in_c_group,
__private const int local_work_size_x,
__private const int local_work_size_y,
__private const float epsilon,
__read_only image2d_t input,
__write_only image2d_t output) {
const int out_cn = get_global_id(0);
const int n = out_cn / in_c_group;
const int c = out_cn % in_c_group;
const int w = get_local_id(1);
const int h = get_local_id(2);
const int local_id = w * local_work_size_y + h;
const int local_total_size = local_work_size_x * local_work_size_y;
__kernel void instance_norm(__read_only image2d_t input,
__write_only image2d_t output,
__read_only image2d_t scale,
__read_only image2d_t bias,
const float epsilon,
const int in_h,
const int in_w){
const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef LOCAL_MEM_128
__local float4 shared_mem[128];
#elif defined(LOCAL_MEM_64)
__local float4 shared_mem[64];
#else
__local float4 shared_mem[256];
#endif
int xOffset = c * in_width;
int yOffset = n * in_height;
float4 sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
sum += read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex));
}
}
shared_mem[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
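// Two-stage work-group reduction: lanes 0..31 fold every 32nd partial sum into
// their own slot, then lane 0 sums the first min(32, group size) slots and
// divides by the pixel count to obtain the per-channel mean.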
sum = 0.0f;
if (local_id < 32) {
for (int i = local_id + 32; i < local_total_size; i += 32) {
sum += shared_mem[i];
}
}
shared_mem[local_id] += sum;
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0.0f;
if (local_id == 0) {
int top = min(32, local_total_size);
for (int i = 0; i < top; i += 1) {
sum += shared_mem[i];
}
shared_mem[0] = sum / (in_width * in_height);
}
barrier(CLK_LOCAL_MEM_FENCE);
const float4 mean_val = shared_mem[0];
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0.0f;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
float4 temp = read_imagef(input, sampler, (int2)(xOffset + xIndex, yOffset + yIndex)) - mean_val;
sum += temp * temp;
}
}
shared_mem[local_id] = sum;
barrier(CLK_LOCAL_MEM_FENCE);
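// Repeat the same reduction over squared deviations to obtain the variance.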
sum = 0.0f;
if (local_id < 32) {
for (int i = local_id + 32; i < local_total_size; i += 32) {
sum += shared_mem[i];
}
}
shared_mem[local_id] += sum;
barrier(CLK_LOCAL_MEM_FENCE);
sum = 0.0f;
if (local_id == 0) {
int top = min(32, local_total_size);
for (int i = 0; i < top; i += 1) {
sum += shared_mem[i];
}
shared_mem[0] = sum / (in_width * in_height);
}
barrier(CLK_LOCAL_MEM_FENCE);
const float4 sigma = sqrt(shared_mem[0] + (float4)(epsilon));
float4 s = 1 / sigma;
for (int xIndex = w; xIndex < in_width; xIndex += local_work_size_x) {
for (int yIndex = h; yIndex < in_height; yIndex += local_work_size_y) {
int2 intout_pos = (int2)(xOffset + xIndex, yOffset + yIndex);
float4 in_val = read_imagef(input, sampler, intout_pos);
half4 out_val = convert_half4((in_val - mean_val) * s);
#ifdef RELU
out_val = activation(out_val);
#endif
write_imageh(output, intout_pos, out_val);
}
}
}
// Paddle-style instance norm (by zhangxi)
__kernel void instance_norm_paddle(__read_only image2d_t input,
__write_only image2d_t output,
__read_only image2d_t scale,
__read_only image2d_t bias,
const float epsilon,
const int in_h,
const int in_w){
__local CL_DTYPE4 saved_mean[1024];
__local CL_DTYPE4 saved_variance[1024];
const int lid = get_local_id(0);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -29,13 +26,15 @@ CLRuntime* CLRuntime::Global() {
CLRuntime::~CLRuntime() {
if (command_queue_ != nullptr) {
command_queue_->flush();
command_queue_->finish();
}
// For controlling the destruction order:
// For controlling the destruction order
command_queue_.reset();
context_.reset();
device_.reset();
platform_.reset();
device_info_.clear();
}
bool CLRuntime::Init() {
......@@ -128,6 +127,12 @@ bool CLRuntime::InitializePlatform() {
}
bool CLRuntime::InitializeDevice() {
// ===================== BASIC =====================
// CL_DEVICE_TYPE_GPU
// CL_DEVICE_NAME
// CL_DEVICE_SUPPORT
// CL_DEVICE_MAX_COMPUTE_UNITS
// CL_DEVICE_MAX_CLOCK_FREQUENCY
std::vector<cl::Device> all_devices;
status_ = platform_->getDevices(CL_DEVICE_TYPE_GPU, &all_devices);
CL_CHECK_ERROR(status_);
......@@ -140,27 +145,153 @@ bool CLRuntime::InitializeDevice() {
auto device_name = device_->getInfo<CL_DEVICE_NAME>();
LOG(INFO) << "Using device: " << device_name;
cl_device_type device_type = device_->getInfo<CL_DEVICE_TYPE>();
auto device_type_to_str = [](cl_device_type t) -> std::string {
std::string t_str{""};
switch (t) {
case CL_DEVICE_TYPE_CPU:
t_str = "CPU";
break;
case CL_DEVICE_TYPE_GPU:
t_str = "GPU";
break;
case CL_DEVICE_TYPE_ACCELERATOR:
t_str = "Accelerator";
break;
case CL_DEVICE_TYPE_DEFAULT:
t_str = "Default";
break;
default:
t_str = "Unknown";
}
return t_str;
};
LOG(INFO) << "device_type:" << device_type_to_str(device_type);
device_info_["CL_DEVICE_TYPE"] = device_type;
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units.";
device_info_["CL_DEVICE_MAX_COMPUTE_UNITS"] = max_units;
auto max_clock_freq = device_->getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>();
LOG(INFO) << "CL_DEVICE_MAX_CLOCK_FREQUENCY:" << max_clock_freq;
device_info_["CL_DEVICE_MAX_CLOCK_FREQUENCY"] = max_clock_freq;
// ===================== MEMORY =====================
// CL_DEVICE_LOCAL_MEM_SIZE
// CL_DEVICE_GLOBAL_MEM_CACHE_SIZE
// CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
// CL_DEVICE_GLOBAL_MEM_SIZE
auto local_mem_kb =
static_cast<float>(device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>()) / 1024;
LOG(INFO) << "The local memory size of the chosen device is " << local_mem_kb
<< " KB.";
device_info_["CL_DEVICE_LOCAL_MEM_SIZE_KB"] = local_mem_kb;
auto global_mem_cache_size_kb =
static_cast<float>(device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHE_SIZE>()) /
1024;
LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHE_SIZE(KB):"
<< global_mem_cache_size_kb << " KB.";
device_info_["CL_DEVICE_GLOBAL_MEM_CACHE_SIZE_KB"] = global_mem_cache_size_kb;
auto global_mem_cacheline_size_kb =
static_cast<float>(
device_->getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>()) /
1024;
LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE(KB):"
<< global_mem_cacheline_size_kb << " KB.";
device_info_["CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE_KB"] =
global_mem_cacheline_size_kb;
auto global_mem_size_kb =
static_cast<float>(device_->getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()) / 1024;
LOG(INFO) << "CL_DEVICE_GLOBAL_MEM_SIZE(KB):" << global_mem_size_kb << " KB.";
device_info_["CL_DEVICE_GLOBAL_MEM_SIZE_KB"] = global_mem_size_kb;
// ===================== WORK_GROUP =====================
// CL_DEVICE_MAX_WORK_GROUP_SIZE
// CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS
// CL_DEVICE_MAX_WORK_ITEM_SIZES
auto max_work_group_size = device_->getInfo<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
LOG(INFO) << "CL_DEVICE_MAX_WORK_GROUP_SIZE:" << max_work_group_size;
device_info_["CL_DEVICE_MAX_WORK_GROUP_SIZE"] = max_work_group_size;
auto max_dims_num = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS>();
LOG(INFO) << "CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:" << max_dims_num;
device_info_["CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS"] = max_dims_num;
auto max_work_item_sizes = device_->getInfo<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
for (size_t i = 0; i < max_work_item_sizes.size(); ++i) {
LOG(INFO) << "max_work_item_sizes[" << i << "]:" << max_work_item_sizes[i];
std::string dim_key = "CL_DEVICE_MAX_WORK_ITEM_SIZES_" + std::to_string(i);
device_info_[dim_key] = max_work_item_sizes[i];
}
// ===================== BUFFER =====================
// CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE
auto max_constant_buffer_size_kb =
static_cast<float>(
device_->getInfo<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>()) /
1024;
LOG(INFO) << "CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE:"
<< max_constant_buffer_size_kb;
device_info_["CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE"] =
max_constant_buffer_size_kb;
// ===================== IMAGE =====================
// CL_DEVICE_IMAGE_SUPPORT
// CL_DEVICE_IMAGE2D_MAX_HEIGHT
// CL_DEVICE_IMAGE2D_MAX_WIDTH
auto image_support = device_->getInfo<CL_DEVICE_IMAGE_SUPPORT>();
if (image_support) {
LOG(INFO) << "The chosen device supports image processing.";
device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 1;
} else {
LOG(INFO) << "The chosen device doesn't support image processing!";
device_info_["CL_DEVICE_IMAGE_SUPPORT"] = 0;
return false;
}
auto image2d_max_height = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_HEIGHT:" << image2d_max_height;
device_info_["CL_DEVICE_IMAGE2D_MAX_HEIGHT"] = image2d_max_height;
auto image2d_max_width = device_->getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
LOG(INFO) << "CL_DEVICE_IMAGE2D_MAX_WIDTH:" << image2d_max_width;
device_info_["CL_DEVICE_IMAGE2D_MAX_WIDTH"] = image2d_max_width;
// ===================== OTHERS / EXTENSION / VERSION =====================
// CL_DEVICE_EXTENSIONS
// CL_DEVICE_ADDRESS_BITS
auto ext_data = device_->getInfo<CL_DEVICE_EXTENSIONS>();
VLOG(4) << "The extensions supported by this device: " << ext_data;
if (ext_data.find("cl_khr_fp16") != std::string::npos) {
LOG(INFO) << "The chosen device supports the half data type.";
device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 1;
} else {
LOG(INFO) << "The chosen device doesn't support the half data type!";
device_info_["CL_DEVICE_EXTENSIONS_FP16"] = 0;
}
auto max_units = device_->getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>();
LOG(INFO) << "The chosen device has " << max_units << " compute units.";
auto local_mem = device_->getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
LOG(INFO) << "The local memory size of the chosen device is "
<< static_cast<float>(local_mem) / 1024 << " KB.";
auto address_bits = device_->getInfo<CL_DEVICE_ADDRESS_BITS>();
LOG(INFO) << "CL_DEVICE_ADDRESS_BITS:" << address_bits;
device_info_["CL_DEVICE_ADDRESS_BITS"] = address_bits;
auto driver_version = device_->getInfo<CL_DRIVER_VERSION>();
LOG(INFO) << "CL_DRIVER_VERSION:" << driver_version;
return true;
}
std::map<std::string, size_t>& CLRuntime::GetDeviceInfo() {
if (0 != device_info_.size()) {
return device_info_;
}
InitializeDevice();
return device_info_;
}
} // namespace lite
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -55,8 +52,10 @@ class CLRuntime {
void set_cl_path(std::string cl_path) { cl_path_ = cl_path; }
std::map<std::string, size_t>& GetDeviceInfo();
private:
CLRuntime() = default;
CLRuntime() { Init(); }
~CLRuntime();
......@@ -84,6 +83,8 @@ class CLRuntime {
return queue;
}
std::map<std::string, size_t> device_info_;
std::string cl_path_;
std::shared_ptr<cl::Platform> platform_{nullptr};
......
......@@ -32,7 +32,7 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#ifndef LITE_SHUTDOWN_LOG
#define CL_CHECK_FATAL(err_code__) \
if (err_code__ != CL_SUCCESS) { \
LOG(FATAL) << string_format( \
......@@ -42,5 +42,8 @@ const char* opencl_error_to_str(cl_int error);
__FILE__, \
__LINE__); \
}
#else
#define CL_CHECK_FATAL(err_code__)
#endif
} // namespace lite
} // namespace paddle
if(NOT LITE_WITH_RKNPU)
return()
endif()
lite_cc_library(device_rknpu SRCS device.cc DEPS ${rknpu_builder_libs} ${rknpu_runtime_libs})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/rknpu/device.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace rknpu {
std::unique_ptr<rk::nn::Exection> Device::Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
) {
VLOG(3) << "[RKNPU] Build model";
rk_graph->SetInputsOutputs(input_nodes, output_nodes);
std::unique_ptr<rk::nn::Exection> exector =
std::unique_ptr<rk::nn::Exection>(new rk::nn::Exection(rk_graph));
exector->Build();
return exector;
}
} // namespace rknpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "rknpu/rknpu_pub.h" // NOLINT
namespace paddle {
namespace lite {
namespace rknpu {
class Device {
public:
static Device& Global() {
static Device x;
return x;
}
Device() {}
// Build the RKNPU IR graph into an executable model and return the
// executor used to load the model and run inference.
std::unique_ptr<rk::nn::Exection> Build(
std::string& model_name, // NOLINT
rk::nn::Graph* rk_graph, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> input_nodes, // NOLINT
std::vector<std::shared_ptr<rk::nn::Tensor>> output_nodes // NOLINT
); // NOLINT
private:
};
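// Illustrative usage (the graph and tensor vectors are assumed to be built
// elsewhere with the RKNPU DDK):
//   std::string model_name = "rk_model";
//   auto exec = paddle::lite::rknpu::Device::Global().Build(
//       model_name, &graph, input_nodes, output_nodes);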
} // namespace rknpu
} // namespace lite
} // namespace paddle
......@@ -10,7 +10,7 @@ if (LITE_ON_MODEL_OPTIMIZE_TOOL)
endif(LITE_ON_MODEL_OPTIMIZE_TOOL)
lite_cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
lite_cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml)
lite_cc_library(x86_cpu_info SRCS cpu_info.cc DEPS xbyak)
lite_cc_library(x86_cpu_info SRCS cpu_info.cc)
add_subdirectory(jit)
add_subdirectory(math)
......@@ -262,7 +262,7 @@ void* GetTensorRtDsoHandle() {
void* GetMKLMLDsoHandle() {
#if defined(__APPLE__) || defined(__OSX__)
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml_intel.dylib");
return GetDsoHandleFromSearchPath(mklml_dir, "libmklml.dylib");
#elif defined(_WIN32)
return GetDsoHandleFromSearchPath(mklml_dir, "mklml.dll");
#else
......
......@@ -28,6 +28,12 @@
#define posix_memalign_free free
#endif
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
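// With the shim above, aligned allocation stays POSIX-style on every platform,
// e.g. (illustrative):
//   void* buf = nullptr;
//   if (posix_memalign(&buf, 32, nbytes) == 0) { /* use buf */ posix_memalign_free(buf); }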
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
......@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) {
void* ptr;
constexpr size_t alignment = 32ul;
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0,
"GenBase Alloc %ld error!",
size);
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr;
}
......
......@@ -96,8 +96,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// : nullptr;
// fill in data
std::vector<size_t> low_level;
size_t low_offset = 0;
std::vector<uint64_t> low_level;
uint64_t low_offset = 0;
for (auto &items : selected_items) {
low_level.push_back(low_offset);
for (auto &item : items) {
......
......@@ -22,8 +22,8 @@ void PrepareCPUTensors(paddle::framework::LoDTensor* ids,
paddle::framework::LoDTensor* pre_scores) {
// lod
paddle::framework::LoD lod;
std::vector<size_t> level0({0, 2, 4});
std::vector<size_t> level1({0, 1, 2, 3, 4});
std::vector<uint64_t> level0({0, 2, 4});
std::vector<uint64_t> level1({0, 1, 2, 3, 4});
lod.push_back(level0);
lod.push_back(level1);
ids->set_lod(lod);
......
......@@ -483,7 +483,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
}
template <>
......@@ -759,7 +759,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>());
mat_out->template mutable_data<T>());
} else {
PADDLE_ENFORCE(dim_a.batch_size_ == dim_b.batch_size_ ||
dim_a.batch_size_ == 0 || dim_b.batch_size_ == 0);
......@@ -773,7 +773,7 @@ void Blas<Target>::MatMul(const lite::Tensor &mat_a,
mat_a.data<T>(),
mat_b.data<T>(),
beta,
mat_out->mutable_data<T>(),
mat_out->template mutable_data<T>(),
dim_a.batch_size_ == 0 ? dim_b.batch_size_ : dim_a.batch_size_,
dim_a.stride_,
dim_b.stride_);
......
......@@ -51,7 +51,7 @@ class ConcatFunctor<lite::TargetType::kX86, T> {
// auto cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
auto output_data = output->mutable_data<T>();
auto output_data = output->template mutable_data<T>();
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
......@@ -108,7 +108,7 @@ class SplitFunctor<lite::TargetType::kX86, T> {
int col_len = output_cols[j];
auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->mutable_data<T>() + k * col_len;
T* dst_ptr = out_tensor->template mutable_data<T>() + k * col_len;
std::copy_n(src_ptr + col_idx, col_len, dst_ptr);
// memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
// sizeof(T) * col_len);
......
......@@ -50,8 +50,8 @@ class CrossEntropyFunctor<lite::TargetType::kX86, T> {
.reshape(batch_axis_remain)
.sum(Eigen::DSizes<int, 1>(1)));
} else {
const T* prob_data = prob->data<T>();
T* loss_data = out->mutable_data<T>();
const T* prob_data = prob->template data<T>();
T* loss_data = out->template mutable_data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
......
......@@ -99,7 +99,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kCFO,
int channels_col = im_channels * filter_height * filter_width;
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
......@@ -161,7 +161,7 @@ class Im2ColFunctor<lite::x86::math::ColFormat::kOCF,
int col_width = col->dims()[1];
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
......@@ -235,7 +235,7 @@ class Col2ImFunctor<lite::x86::math::ColFormat::kOCF,
"col_width and padding(padding_left, padding_right) are "
"inconsistent.");
T* im_data = im->mutable_data<T>();
T* im_data = im->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
......
......@@ -42,7 +42,7 @@ inline void im2col_common(const lite::Tensor& im,
int channels_col = im_channels * filter_height * filter_width;
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
......@@ -77,7 +77,7 @@ inline void im2col_sh1sw1dh1dw1ph0pw0(const lite::Tensor& im,
int output_width = col->dims()[4];
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
int col_matrix_width = output_width * output_height;
int im_size = im_height * im_width;
size_t copy_size = sizeof(T) * output_width;
......@@ -123,7 +123,7 @@ inline void im2col_sh1sw1dh1dw1ph1pw1(const lite::Tensor& im,
constexpr int prw = 1;
const T* im_data = im.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
int im_size = im_height * im_width;
int col_matrix_width = output_width * output_height;
int col_block_fh = filter_width * col_matrix_width; // fw*oh*ow
......
......@@ -65,7 +65,7 @@ struct TensorSetConstantCPU {
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto* begin = tensor_->mutable_data<T>(lite::TargetType::kX86);
auto* begin = tensor_->template mutable_data<T>(lite::TargetType::kX86);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
lite::Tensor* tensor_;
......@@ -126,7 +126,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>();
const T* vector_data = vector.data<T>();
T* output_data = output->mutable_data<T>();
T* output_data = output->template mutable_data<T>();
for (int64_t i = 0; i < in_dims[0]; ++i) {
for (int64_t j = 0; j < size; ++j) {
output_data[i * in_dims[0] + j] =
......
......@@ -83,7 +83,7 @@ class ColwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), size);
T* out_buf = out->mutable_data<T>(out->target());
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......@@ -129,7 +129,7 @@ class RowwiseMean<lite::TargetType::kX86, T> {
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
auto inv_size = 1.0 / size;
T* out_buf = out->mutable_data<T>(out->target());
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......@@ -173,7 +173,7 @@ class RowwiseSum<lite::TargetType::kX86, T> {
auto size = in_dims[1];
PADDLE_ENFORCE_EQ(out->numel(), height);
T* out_buf = out->mutable_data<T>(out->target());
T* out_buf = out->template mutable_data<T>(out->target());
const T* in_buf = input.data<T>();
for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
......
......@@ -35,7 +35,7 @@ class MaxOutFunctor<lite::TargetType::kX86, T> {
// c_size means the output size of each sample
int c_size = fea_size * output_channels;
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) {
int new_bindex = c_size * i;
......@@ -72,7 +72,8 @@ class MaxOutGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; ++i) {
int blen = fea_size * output_channels * i;
......
......@@ -54,8 +54,8 @@ class Pool2dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int input_stride = input_height * input_width;
const int output_stride = output_height * output_width;
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
const T* input_data = input->template data<T>();
T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend;
int wstart, wend;
......@@ -137,7 +137,8 @@ class Pool2dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int hstart, hend;
int wstart, wend;
......@@ -220,7 +221,8 @@ class MaxPool2dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......@@ -322,7 +324,7 @@ class Pool3dFunctor<lite::TargetType::kX86, PoolProcess, T> {
const int output_stride = output_depth * output_height * output_width;
const T* input_data = input.data<T>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend;
int hstart, hend;
......@@ -425,7 +427,8 @@ class Pool3dGradFunctor<lite::TargetType::kX86, PoolProcess, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
int dstart, dend;
int hstart, hend;
......@@ -530,7 +533,8 @@ class MaxPool3dGradFunctor<lite::TargetType::kX86, T> {
const T* input_data = input.data<T>();
const T* output_data = output.data<T>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int i = 0; i < batch_size; i++) {
for (int c = 0; c < output_channels; ++c) {
......
......@@ -58,11 +58,11 @@ class SampleWithProb {
const int64_t* label_data = L->data<int64_t>();
// int64_t* samples_data =
// S->mutable_data<int64_t>(ret_dim, Target);
// T* probabilities_data = P->mutable_data<T>(ret_dim, Target);
// T* probabilities_data = P->template mutable_data<T>(ret_dim, Target);
S->Resize({batch_size, num_sampled_classes});
auto* samples_data = S->mutable_data<int64_t>(Target);
P->Resize({batch_size, num_sampled_classes});
auto* probabilities_data = P->mutable_data<T>(Target);
auto* probabilities_data = P->template mutable_data<T>(Target);
// temp sets for unique sampling
std::unordered_set<int64_t> tmp_samples;
......
......@@ -42,7 +42,7 @@ class SearchFcFunctor<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>({bottom.dims()[0], out_size}));
const auto bottom_data = bottom.data<T>();
auto top_data = top->mutable_data<T>(lite::TargetType::kX86);
auto top_data = top->template mutable_data<T>(lite::TargetType::kX86);
const auto weights = w.data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
call_gemm<lite::X86Context, T>(blas,
......
......@@ -52,7 +52,7 @@ struct SelectedRowsAdd<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
auto* out_data = out_value->mutable_data<T>();
auto* out_data = out_value->template mutable_data<T>();
auto* in1_data = in1_value.data<T>();
std::copy_n(in1_data, in1_value.numel(), out_data);
......@@ -87,7 +87,7 @@ struct SelectedRowsAddTensor<lite::TargetType::kX86, T> {
functor(context, output, 0.0);
auto* in1_data = in1_value.data<T>();
auto* out_data = output->mutable_data<T>();
auto* out_data = output->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) {
......@@ -127,7 +127,7 @@ struct SelectedRowsAddTo<lite::TargetType::kX86, T> {
in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
auto* in1_data = in1_value.data<T>();
auto* in2_data = in2_value->mutable_data<T>();
auto* in2_data = in2_value->template mutable_data<T>();
std::copy_n(in1_data, in1_value.numel(), in2_data + input2_offset);
}
};
......@@ -161,7 +161,7 @@ struct SelectedRowsSumTo<lite::TargetType::kX86, T> {
input2->set_rows(in2_rows);
auto* in2_value = input2->mutable_value();
T* in2_data = in2_value->mutable_data<T>();
T* in2_data = in2_value->template mutable_data<T>();
auto blas = math::GetBlas<lite::TargetType::kX86, T>(context);
size_t offset = 0u;
for (size_t i = 0u; i != input1.size(); ++i) {
......@@ -194,7 +194,7 @@ struct SelectedRowsAddToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->mutable_data<T>();
auto* input2_data = input2->template mutable_data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) {
......@@ -305,7 +305,7 @@ struct MergeAdd<lite::TargetType::kX86, T> {
lite::DDim dims(std::vector<int64_t>(
{static_cast<int64_t>(merged_row_set.size()), input_width}));
out.mutable_value()->Resize(dims);
auto* out_data = out.mutable_value()->mutable_data<T>();
auto* out_data = out.mutable_value()->template mutable_data<T>();
if (merged_row_set.size() == row_num && !sorted_result) {
// no duplicated ids, just concat the result together
......@@ -385,7 +385,7 @@ struct UpdateToTensor<lite::TargetType::kX86, T> {
PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
auto* input2_data = input2->template data<T>();
// FIXME(typhoonzero): use a macro to fix the messy code below.
switch (op) {
......
......@@ -24,10 +24,10 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
public:
void operator()(const lite::Context<lite::TargetType::kX86>& context,
const lite::Tensor& src,
const std::vector<size_t>& index_lod,
const std::vector<uint64_t>& index_lod,
lite::Tensor* dst,
bool is_src_index) {
const size_t* index = index_lod.data();
const uint64_t* index = index_lod.data();
const auto& src_dims = src.dims();
const auto& dst_dims = dst->dims();
PADDLE_ENFORCE_EQ(
......@@ -39,7 +39,7 @@ class CopyMatrixRowsFunctor<lite::TargetType::kX86, T> {
auto height = dst_dims[0];
auto width = dst_dims[1];
auto* src_data = src.data<T>();
auto* dst_data = dst->mutable_data<T>();
auto* dst_data = dst->template mutable_data<T>();
const int sz = width * sizeof(T);
if (is_src_index) {
for (int i = 0; i < height; ++i) {
......
......@@ -36,7 +36,7 @@ class CopyMatrixRowsFunctor {
// The indexed rows are based on the input index.
void operator()(const lite::Context<Target>& context,
const lite::Tensor& src,
const std::vector<size_t>& index_lod,
const std::vector<uint64_t>& index_lod,
lite::Tensor* dst,
bool is_src_index);
};
......@@ -130,8 +130,8 @@ class LoDTensor2BatchFunctor {
// batch_lods[2] is the sort order for the input LoDTensor.
batch_lods->at(2).resize(seq_info.size());
size_t* batch_starts = batch_lods->at(0).data();
size_t* seq2batch_idx = batch_lods->at(1).data();
auto* batch_starts = batch_lods->at(0).data();
auto* seq2batch_idx = batch_lods->at(1).data();
batch_starts[0] = 0;
for (int n = 0; n < max_seqlen; n++) {
auto batch_id = static_cast<int>(batch_starts[n]);
......@@ -148,7 +148,7 @@ class LoDTensor2BatchFunctor {
}
batch_starts[n + 1] = static_cast<size_t>(batch_id);
}
size_t* seq_order = batch_lods->at(2).data();
auto* seq_order = batch_lods->at(2).data();
for (size_t i = 0; i < seq_info.size(); ++i) {
seq_order[i] = seq_info[i].seq_idx;
}
......
......@@ -22,15 +22,15 @@ namespace math {
template <typename T>
void CopyValidData(lite::Tensor* dst_tensor,
const lite::Tensor* src_tensor,
const std::vector<size_t>& seq_offsets,
const std::vector<uint64_t>& seq_offsets,
int pad_seq_len,
int step_width,
bool norm_by_len,
CopyType type,
PadLayout layout) {
int seq_num = seq_offsets.size() - 1;
const T* src_data = src_tensor->data<T>();
T* dst_data = dst_tensor->mutable_data<T>();
const T* src_data = src_tensor->template data<T>();
T* dst_data = dst_tensor->template mutable_data<T>();
int seq_cpy_gap = step_width;
int pad_cpy_gap =
......@@ -113,7 +113,7 @@ class PaddingLoDTensorFunctor<lite::TargetType::kX86, T> {
"'step_width'.");
// fill padding value
T* pad_data = pad_tensor->mutable_data<T>();
T* pad_data = pad_tensor->template mutable_data<T>();
const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) {
fast_mem_init<T>(
......
......@@ -30,10 +30,10 @@ enum PadLayout { kBatchLengthWidth = 0, kLengthBatchWidth };
enum CopyType { kSeqToPad, kPadToSeq };
inline static size_t MaximumSequenceLength(
const std::vector<size_t>& seq_offset) {
size_t seq_num = seq_offset.size() - 1;
size_t max_seq_len = 0;
inline static uint64_t MaximumSequenceLength(
const std::vector<uint64_t>& seq_offset) {
uint64_t seq_num = seq_offset.size() - 1;
uint64_t max_seq_len = 0;
for (size_t i = 0; i < seq_num; ++i) {
max_seq_len = std::max(max_seq_len, seq_offset[i + 1] - seq_offset[i]);
}
......@@ -42,7 +42,7 @@ inline static size_t MaximumSequenceLength(
inline static void CheckDims(const lite::DDim& seq_tensor_dims,
const lite::DDim& pad_tensor_dims,
const std::vector<size_t>& seq_offset,
const std::vector<uint64_t>& seq_offset,
int64_t padded_seq_len,
int64_t step_width,
const PadLayout& layout) {
......
......@@ -55,7 +55,7 @@ class MaxSeqPoolFunctor {
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>();
T* out_data = output->template mutable_data<T>();
int* max_index = index->mutable_data<int>();
int64_t num_seq = out_dims[0];
......@@ -103,7 +103,7 @@ class MaxSeqPoolFunctor<T, true> {
auto starts = input.lod()[0];
const T* in_data = input.data<T>();
T* out_data = output->mutable_data<T>();
T* out_data = output->template mutable_data<T>();
int64_t num_seq = out_dims[0];
int64_t dim = output->numel() / num_seq;
......@@ -145,7 +145,7 @@ class MaxSeqPoolGradFunctor {
const T* og_data = out_grad.data<T>();
const int* max_index = index.data<int>();
T* ig_data = in_grad->mutable_data<T>();
T* ig_data = in_grad->template mutable_data<T>();
SetConstant<TARGET(kX86), T> set_zero;
set_zero(context, in_grad, static_cast<T>(0.0));
......@@ -170,7 +170,7 @@ class LastSeqPoolFunctor {
lite::Tensor* output) {
// Create pointers to input and output data
auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>();
auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0];
......@@ -203,7 +203,7 @@ class FirstSeqPoolFunctor {
lite::Tensor* output) {
// Create pointers to input and output data
auto* in_data = input.data<T>();
auto* out_data = output->mutable_data<T>();
auto* out_data = output->template mutable_data<T>();
// Calculate the size of each item in sequence
int64_t item_size = input.numel() / input.dims()[0];
......@@ -238,7 +238,7 @@ class SumSeqPoolGradFunctor {
int64_t in_w = in_grad->numel() / in_grad->dims()[0];
PADDLE_ENFORCE(in_w == out_w);
const T* out_g_data = out_grad.data<T>();
T* in_g_data = in_grad->mutable_data<T>(TARGET(kX86));
T* in_g_data = in_grad->template mutable_data<T>(TARGET(kX86));
auto blas = math::GetBlas<TARGET(kX86), T>(context);
for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
......@@ -288,7 +288,7 @@ class SequencePoolFunctor<TARGET(kX86), T> {
auto lod = input.lod()[0];
if (pooltype == "SUM") {
const T* src = input.data<T>();
T* dst = output->mutable_data<T>(TARGET(kX86));
T* dst = output->template mutable_data<T>(TARGET(kX86));
jit::seq_pool_attr_t attr(
static_cast<int>(input.numel() / input.dims()[0]),
jit::SeqPoolType::kSum);
......
......@@ -101,13 +101,13 @@ void TestSequencePoolingSum(const paddle::framework::LoD& lod) {
TEST(SequencePoolingGrad, CPU_SUM) {
paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10});
lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace,
float>(lod1);
paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace,
float>(lod2);
......@@ -116,13 +116,13 @@ TEST(SequencePoolingGrad, CPU_SUM) {
#ifdef PADDLE_WITH_CUDA
TEST(SequencePoolingGrad, CUDA_SUM) {
paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10});
lod1.push_back(std::vector<uint64_t>{0, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace,
float>(lod1);
paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
lod2.push_back(std::vector<uint64_t>{0, 2, 7, 10});
TestSequencePoolingSum<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace,
float>(lod2);
......
......@@ -32,7 +32,7 @@ class ScaleLoDTensorFunctor<lite::TargetType::kX86, T> {
size_t seq_width = seq->dims()[1];
lite::LoD abs_offset_lod = lite::fluid::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(lite::TargetType::kX86);
T* seq_data = seq->template mutable_data<T>(lite::TargetType::kX86);
for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width;
......
......@@ -83,7 +83,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
auto pos_data = pos->mutable_data<int>(lite::TargetType::kX86);
int offset = 0;
std::vector<size_t> vec_out_lod;
std::vector<uint64_t> vec_out_lod;
vec_out_lod.reserve(batch_size + 1);
for (int i = 0; i <= batch_size; ++i) {
offset = row_lod[i];
......@@ -95,7 +95,7 @@ class SequenceTopkAvgPoolingFunctor<lite::TargetType::kX86, T> {
out->set_lod(lod_temp);
auto in_data = in.data<T>();
auto out_data = out->mutable_data<T>(lite::TargetType::kX86);
auto out_data = out->template mutable_data<T>(lite::TargetType::kX86);
T* sum_data = new T[max_k];
for (int i = 0; i < batch_size; ++i) {
......
......@@ -108,8 +108,8 @@ class SoftmaxFunctor<Target, T, is_test, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* in_data = X->data<T>();
auto* out_data = Y->mutable_data<T>();
const T* in_data = X->template data<T>();
auto* out_data = Y->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1);
......@@ -219,9 +219,9 @@ class SoftmaxGradFunctor<Target, T, enable_if_CPU<Target>> {
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && lite::x86::MayIUse(lite::x86::avx)) {
const T* out_data = y->data<T>();
const T* out_grad = y_grad->data<T>();
T* in_grad = x_grad->mutable_data<T>();
const T* out_data = y->template data<T>();
const T* out_grad = y_grad->template data<T>();
T* in_grad = x_grad->template mutable_data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T scalar;
vec_mul_reduce<T, lite::x86::avx>(
......
......@@ -104,12 +104,12 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
patch_size = processing_list.size();
// T *patch_data =
// patch->mutable_data<T>({static_cast<int64_t>(patch_size),
// patch->template mutable_data<T>({static_cast<int64_t>(patch_size),
// static_cast<int64_t>(patch_elem_size)},
// cpu_place);
patch->Resize({static_cast<int64_t>(patch_size),
static_cast<int64_t>(patch_elem_size)});
auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86);
auto *patch_data = patch->template mutable_data<T>(lite::TargetType::kX86);
constant(context, patch, 0);
const T *features = node_features.data<T>();
......@@ -166,12 +166,12 @@ class Col2TreeFunctor<lite::TargetType::kX86, T> {
}
}
// T *grad_data =
// in_grad->mutable_data<T>({static_cast<int64_t>(node_count),
// in_grad->template mutable_data<T>({static_cast<int64_t>(node_count),
// static_cast<int64_t>(grad_elem_size)},
// cpu_place);
in_grad->Resize({static_cast<int64_t>(node_count),
static_cast<int64_t>(grad_elem_size)});
auto *grad_data = in_grad->mutable_data<T>(lite::TargetType::kX86);
auto *grad_data = in_grad->template mutable_data<T>(lite::TargetType::kX86);
constant(context, in_grad, 0);
const T *out_g = out_grad.data<T>();
......
......@@ -36,7 +36,7 @@ class Unpool2dMaxFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width;
const T* input_data = input.data<T>();
const int* indices_data = indices.data<int>();
T* output_data = output->mutable_data<T>(lite::TargetType::kX86);
T* output_data = output->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) {
for (int i = 0; i < input_feasize; ++i) {
......@@ -70,7 +70,8 @@ class Unpool2dMaxGradFunctor<lite::TargetType::kX86, T> {
int output_feasize = output_height * output_width;
const int* indices_data = indices.data<int>();
const T* output_grad_data = output_grad.data<T>();
T* input_grad_data = input_grad->mutable_data<T>(lite::TargetType::kX86);
T* input_grad_data =
input_grad->template mutable_data<T>(lite::TargetType::kX86);
for (int b = 0; b < batch_size; ++b) {
for (int c = 0; c < output_channels; ++c) {
......
......@@ -75,7 +75,7 @@ class Vol2ColFunctor<lite::TargetType::kX86, T> {
"mismatching.");
const T* vol_data = vol.data<T>();
T* col_data = col->mutable_data<T>();
T* col_data = col->template mutable_data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
......@@ -159,7 +159,7 @@ class Col2VolFunctor<lite::TargetType::kX86, T> {
output_width,
"input_width and output_width are "
"mismatching.");
T* vol_data = vol->mutable_data<T>();
T* vol_data = vol->template mutable_data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
......
......@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() {
// Nested omp parallelism is not supported.
num_threads = omp_in_parallel() ? 1 : omp_get_max_threads();
#endif
return std::max(num_threads, 1L);
return std::max<int>(num_threads, 1L);
}
using ThreadHandler =
......
......@@ -14,10 +14,10 @@
#pragma once
#include <time.h>
#include <cstdio>
#include <stdexcept>
#include <time.h>
#include <memory>
#include <string>
......@@ -37,7 +37,9 @@
#define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose
#include <stdio.h>
#define NOMINMAX  // MSVC's max/min macros conflict with std::min/max
#include <windows.h>
#include <winsock.h>
#include <numeric> // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
......@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule);
}
extern struct timeval;
static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock;
struct tm tm;
......
......@@ -2,4 +2,7 @@ if(NOT LITE_WITH_XPU)
return()
endif()
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
if(LITE_WITH_XTCL)
lite_cc_library(device_xpu SRCS device.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
endif()
lite_cc_library(target_wrapper_xpu SRCS target_wrapper.cc DEPS ${xpu_builder_libs} ${xpu_runtime_libs})
......@@ -14,12 +14,12 @@
#pragma once
#include <xtcl/xtcl.h>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stdint.h>
#include <cmath>
#include <cstdlib>
#include <utility>
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
namespace xpu {
namespace math {
static inline long round_half_to_even(const float src) { // NOLINT
long ret = llround(src); // NOLINT
if (fabs(fabs(round(src) - src) - 0.5) > 0) {
return ret;
} else {
if (abs(ret) % 2 == 0) {
return ret;
} else {
return ret + (ret > 0 ? -1 : 1);
}
}
}
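// e.g. round_half_to_even(2.5f) == 2 and round_half_to_even(3.5f) == 4;
// non-tie inputs such as 2.3f round normally (to 2).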
static float ieee_compliance_0(float f) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&f);
uint32_t sign = (*ptr) & 0x80000000;
uint32_t uf = 0;
// nan -> inf
if (std::isnan(f)) {
uf = (sign | 0x7F800000);
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
} else if (std::isnormal(f) || (std::isinf(f)) || (f == 0)) {
return f;
} else {
// denormal -> +-0
uf = 0x0;
float *ptr = reinterpret_cast<float *>(&uf);
return *ptr;
}
}
template <typename T, int RMAX>
static inline T fp32_to_intx(const float f, float max) {
max = ieee_compliance_0(max);
float input = ieee_compliance_0(f);
// +0 and -0 -> +0
if (input == 0) {
input = 0.0f;
}
float tmp = RMAX / max;
if (std::isinf(tmp)) {
uint32_t *ptr = reinterpret_cast<uint32_t *>(&input);
if ((*ptr) >> 31 & 1) {
return T(-RMAX);
} else {
return T(RMAX);
}
}
tmp = input * tmp;
if (std::isnan(tmp)) {
return T(RMAX);
}
tmp = ieee_compliance_0(tmp);
// early check to avoid INF or big value get into convertor func.
if (tmp > RMAX) {
return T(RMAX);
}
if (tmp < -RMAX) {
return T(-RMAX);
}
T ret = (T)round_half_to_even(tmp);
if (ret > RMAX) {
ret = T(RMAX);
}
if (ret < -RMAX) {
ret = T(-RMAX);
}
return ret;
}
static inline int16_t fp32_to_int16(const float f, float max) {
int16_t v1 = fp32_to_intx<int16_t, 32767>(f, max);
return v1;
}
static inline int ConvertFP32ToInt16(const void *input,
void *output,
float max_val,
int len) {
for (int i = 0; i < len; i++) {
static_cast<int16_t *>(output)[i] =
fp32_to_int16(static_cast<const float *>(input)[i], max_val);
}
return 0;
}
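// With max_val taken from FindMaxAbs below, an input equal to max_val maps to
// +/-32767, 0 maps to 0, and anything larger in magnitude is clamped to the
// int16 range.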
static inline float FindMaxAbs(const float *data, int len) {
float max_f = 0.0f;
for (int i = 0; i < len; ++i) {
float max = std::abs(data[i]);
if (max > max_f) {
max_f = max;
}
}
return max_f;
}
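// Transposes a row-major h x w matrix `in` into a row-major w x h matrix `out`.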
template <typename T>
static inline void Transpose(const T *in, T *out, int h, int w) {
for (int h1 = 0; h1 < w; ++h1) {
for (int w1 = 0; w1 < h; ++w1) {
out[h1 * h + w1] = in[w1 * w + h1];
}
}
}
/**
* Get row matrix shape from a vector shape. If the rank of x_dim > 1, the
* original x_dim is returned.
*/
static lite::DDim RowMatrixFromVector(const lite::DDim &x_dim) {
if (x_dim.size() > 1) {
return x_dim;
}
return lite::DDim({1, x_dim[0]});
}
/**
* Get column matrix shape from a vector shape. If the rank of y_dim > 1, the
* original y_dim is returned.
*/
static lite::DDim ColumnMatrixFromVector(const lite::DDim &y_dim) {
if (y_dim.size() > 1) {
return y_dim;
}
return lite::DDim({y_dim[0], 1});
}
/**
* Matrix descriptor of a memory buffer.
*
* It is used for Blas::MatMul. The MatMul operator can be batched:
* if Mat A is [BatchSize, H, W] and Mat B is [BatchSize, H, W], the call
* becomes `batch_size` GEMMs. A batched GEMM can be faster, depending on the
* implementation of the BLAS library. The batch size may be zero, and if
* either operand of `matmul` has a batch size, the result is a batched GEMM as
* well, e.g. Mat A of [BatchSize, H1, W1] times Mat B of [W1, W2] yields a
* result of [BatchSize, H1, W2].
*
* The boolean flag `trans` describes whether the memory holds the transpose of
* the matrix. If `trans` is true, the last two dims are transposed, i.e. the
* memory layout is [Width, Height] or [BatchSize, Width, Height].
*
* MatDescriptor is not only the dimension or shape of a matrix; it also
* carries the layout and stride, which is clearer than reusing `DDim`.
*/
struct MatDescriptor {
int64_t height_;
int64_t width_;
int64_t stride_{0};
int64_t batch_size_{0};
bool trans_;
};
static MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols,
bool trans) {
MatDescriptor retv;
if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
retv.height_ = flatten_dim[0];
retv.width_ = flatten_dim[1];
} else {
if (tensor_dim.size() == 2) {
retv.height_ = tensor_dim[0];
retv.width_ = tensor_dim[1];
} else {
auto dim_vec = tensor_dim.Vectorize();
retv.batch_size_ = 1;
for (size_t i = 0; i < dim_vec.size() - 2; ++i) {
retv.batch_size_ *= dim_vec[i];
}
retv.height_ = dim_vec[dim_vec.size() - 2];
retv.width_ = dim_vec[dim_vec.size() - 1];
retv.stride_ = retv.height_ * retv.width_;
}
}
if (trans) {
std::swap(retv.width_, retv.height_);
}
retv.trans_ = trans;
return retv;
}
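// e.g. tensor_dim = {2, 3, 4} with num_flatten_cols = 1 and trans = false gives
// batch_size_ = 2, height_ = 3, width_ = 4, stride_ = 12.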
} // namespace math
} // namespace xpu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
namespace paddle {
namespace lite {
void* TargetWrapperXPU::Malloc(size_t size) {
void* ptr{nullptr};
xpu_malloc(&ptr, size);
return ptr;
}
void TargetWrapperXPU::Free(void* ptr) { xpu_free(ptr); }
void TargetWrapperXPU::MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir) {
switch (dir) {
case IoDirection::HtoD:
xpu_memcpy(dst, src, size, XPU_HOST_TO_DEVICE);
break;
case IoDirection::DtoH:
xpu_memcpy(dst, src, size, XPU_DEVICE_TO_HOST);
break;
default:
LOG(FATAL) << "Unsupported IoDirection " << static_cast<int>(dir);
}
}
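// Usage sketch (illustrative; `host_src`, `host_dst`, and `bytes` are
// hypothetical names): a typical host <-> XPU round trip with the wrapper
// defined above looks like
//
//   void* dev = TargetWrapperXPU::Malloc(bytes);
//   TargetWrapperXPU::MemcpySync(dev, host_src, bytes, IoDirection::HtoD);
//   /* launch an XPU kernel that reads/writes `dev` */
//   TargetWrapperXPU::MemcpySync(host_dst, dev, bytes, IoDirection::DtoH);
//   TargetWrapperXPU::Free(dev);
//
// Note that Malloc does not check the status returned by xpu_malloc, so a
// failed allocation surfaces as a null pointer.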
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "lite/core/target_wrapper.h"
namespace paddle {
namespace lite {
using TargetWrapperXPU = TargetWrapper<TARGET(kXPU)>;
template <>
class TargetWrapper<TARGET(kXPU)> {
public:
static size_t num_devices() { return 1; }
static size_t maximum_stream() { return 0; }
static void* Malloc(size_t size);
static void Free(void* ptr);
static void MemcpySync(void* dst,
const void* src,
size_t size,
IoDirection dir);
};
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#pragma GCC system_header
#include <xpu/api.h>
#include <xpu/golden.h>
#include <xpu/runtime.h>
#if defined(LITE_WITH_XTCL)
#include <xtcl/xtcl.h>
#endif
namespace paddle {
namespace lite {
namespace xdnn = baidu::xpu::api;
} // namespace lite
} // namespace paddle
......@@ -5,6 +5,7 @@ lite_cc_library(target_wrapper SRCS target_wrapper.cc
DEPS target_wrapper_host place
X86_DEPS target_wrapper_x86
CUDA_DEPS target_wrapper_cuda
XPU_DEPS target_wrapper_xpu
CL_DEPS cl_target_wrapper
FPGA_DEPS fpga_target_wrapper
BM_DEPS target_wrapper_bm
......@@ -37,7 +38,7 @@ lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
if (LITE_WITH_ARM)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info CL_DEPS cl_context)
else()
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context)
lite_cc_library(context SRCS context.cc DEPS tensor any device_info eigen3 CL_DEPS cl_context CUDA_DEPS cuda_context)
endif()
#-------------------------------------------- GET CODE META INFO ------------------------------------------
......
......@@ -6,5 +6,5 @@ endif()
lite_cc_library(arena_framework SRCS framework.cc DEPS program gtest)
if((NOT LITE_WITH_OPENCL) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite_cc_test(test_arena_framework SRCS framework_test.cc DEPS arena_framework ${rknpu_kernels} ${mlu_kernels} ${bm_kernels} ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${fpga_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
endif()
......@@ -15,5 +15,11 @@
#include "lite/core/context.h"
namespace paddle {
namespace lite {} // namespace lite
namespace lite {
#ifdef LITE_WITH_XPU
thread_local xdnn::Context* Context<TargetType::kXPU>::_tls_raw_ctx{nullptr};
#endif
} // namespace lite
} // namespace paddle