提交 80af11c6 编写于 作者: J jiweibo

Merge remote-tracking branch 'origin' into test_result

...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " ...@@ -36,6 +36,31 @@ message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}") message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32)
# Windows-only global build configuration (applies to every target below).
option(MSVC_STATIC_CRT "use static C Runtime library by default" ON)
# Stop Visual Studio from re-running CMake/regenerating projects on each build.
set(CMAKE_SUPPRESS_REGENERATION ON)
# Name static libraries "libfoo.lib" to match the Unix-style prefix used elsewhere.
set(CMAKE_STATIC_LIBRARY_PREFIX lib)
# glog is linked statically here, so its DLL export macro must expand to nothing.
add_definitions("/DGOOGLE_GLOG_DLL_DECL=")
if (MSVC_STATIC_CRT)
# /MT(d): statically link the C runtime; /bigobj: allow >65k sections per object file.
set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT")
endif()
# Silence known-noisy MSVC warnings (unknown pragmas, narrowing conversions,
# source-encoding notices, etc.) for the whole build.
add_compile_options(/wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838)
# /MP: let cl.exe compile translation units in parallel.
add_compile_options(/MP)
message(STATUS "Using parallel compiling (/MP)")
# Suppress benign linker warnings (e.g. empty object files) on all link steps.
set(PADDLE_LINK_FLAGS "/IGNORE:4006 /IGNORE:4098 /IGNORE:4217 /IGNORE:4221")
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
endif()
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
find_package(CUDA QUIET) find_package(CUDA QUIET)
endif() endif()
...@@ -64,6 +89,7 @@ lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF) ...@@ -64,6 +89,7 @@ lite_option(LITE_WITH_MLU "Enable MLU in lite mode" OFF)
lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF) lite_option(LITE_WITH_XPU "Enable XPU in lite mode" OFF)
lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU) lite_option(LITE_WITH_XTCL "Enable XPU via XTCL" OFF IF LITE_WITH_XPU)
lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF) lite_option(LITE_WITH_BM "Enable BM in lite mode" OFF)
lite_option(LITE_WITH_APU "Enable APU in lite mode" OFF)
lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF) lite_option(LITE_WITH_TRAIN "Enable training operators and kernels in lite" OFF)
lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON) lite_option(LITE_WITH_OPENMP "Enable OpenMP in lite framework" ON)
lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF) lite_option(LITE_WITH_OPENCL "Enable OpenCL support in lite" OFF)
...@@ -106,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING ...@@ -106,9 +132,16 @@ set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
# CMAKE_BUILD_TYPE # CMAKE_BUILD_TYPE
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
if(WIN32)
set(CMAKE_BUILD_TYPE "Release" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE)
else()
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
FORCE) FORCE)
endif()
endif() endif()
message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}") message(STATUS "CMAKE_BUILD_TYPE: ${CMAKE_BUILD_TYPE}")
...@@ -141,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK) ...@@ -141,6 +174,7 @@ if (WITH_LITE AND LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
include(cross_compiling/postproject) include(cross_compiling/postproject)
include(device/npu) # check and prepare NPU DDK include(device/npu) # check and prepare NPU DDK
include(device/xpu) # check and prepare XPU SDK include(device/xpu) # check and prepare XPU SDK
include(device/apu) # check and prepare APU SDK
# We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON # We compile the mobile deployment library when LITE_ON_TINY_PUBLISH=ON
# So the following third party dependencies are not needed. # So the following third party dependencies are not needed.
...@@ -190,6 +224,7 @@ endif() ...@@ -190,6 +224,7 @@ endif()
include(external/mklml) # download mklml package include(external/mklml) # download mklml package
include(external/xbyak) # download xbyak package include(external/xbyak) # download xbyak package
include(external/libxsmm) # download, build, install libxsmm include(external/libxsmm) # download, build, install libxsmm
include(external/gflags) # download, build, install gflags include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog include(external/glog) # download, build, install glog
......
@echo off
setlocal
setlocal enabledelayedexpansion
set source_path=%~dp0
rem global variables
set BUILD_EXTRA=OFF
set BUILD_JAVA=ON
set BUILD_PYTHON=OFF
set BUILD_DIR=%source_path%
set OPTMODEL_DIR=""
set BUILD_TAILOR=OFF
set BUILD_CV=OFF
set SHUTDOWN_LOG=ON
set THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
set workspace=%source_path%
rem Derived paths. These must be defined BEFORE first use: the previous
rem version tested %build_directory% before setting it, so the stale build
rem directory was never cleaned and never created when missing.
set root_dir=%workspace%
set build_directory=%BUILD_DIR%\build.lite.x86
set GEN_CODE_PATH_PREFIX=%build_directory%\lite\gen_code
set DEBUG_TOOL_PATH_PREFIX=%build_directory%\lite\tools\debug
:set_vcvarsall_dir
SET /P vcvarsall_dir="Please input the path of visual studio command Prompt, such as C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat =======>"
set tmp_var=!vcvarsall_dir!
call:remove_space
set vcvarsall_dir=!tmp_var!
IF NOT EXIST "%vcvarsall_dir%" (
echo "------------%vcvarsall_dir% not exist------------"
goto set_vcvarsall_dir
)
call:prepare_thirdparty
rem Recreate the build directory from scratch (md creates parent dirs too).
if EXIST "%build_directory%" (
call:rm_rebuild_dir "%build_directory%"
)
md "%build_directory%"
rem for code gen, a source file is generated after a test, but is depended on by some targets in cmake.
rem here we fake an empty file to make cmake work.
if NOT EXIST "%GEN_CODE_PATH_PREFIX%" (
md "%GEN_CODE_PATH_PREFIX%"
)
type nul >"%GEN_CODE_PATH_PREFIX%\__generated_code__.cc"
if NOT EXIST "%DEBUG_TOOL_PATH_PREFIX%" (
md "%DEBUG_TOOL_PATH_PREFIX%"
)
copy "%root_dir%\lite\tools\debug\analysis_tool.py" "%DEBUG_TOOL_PATH_PREFIX%\"
cd "%build_directory%"
cmake .. -G "Visual Studio 14 2015 Win64" -T host=x64 -DWITH_MKL=ON  ^
-DWITH_MKLDNN=OFF ^
-DLITE_WITH_X86=ON  ^
-DLITE_WITH_PROFILE=OFF ^
-DWITH_LITE=ON ^
-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF ^
-DLITE_WITH_ARM=OFF ^
-DWITH_GPU=OFF ^
-DLITE_BUILD_EXTRA=ON ^
-DLITE_WITH_PYTHON=ON ^
-DPYTHON_EXECUTABLE="%python_path%"
call "%vcvarsall_dir%" amd64
msbuild /m /p:Configuration=Release lite\publish_inference.vcxproj >mylog.txt 2>&1
goto:eof
:prepare_thirdparty
SET /P python_path="Please input the path of python.exe, such as C:\Python35\python.exe, C:\Python35\python3.exe =======>"
set tmp_var=!python_path!
call:remove_space
set python_path=!tmp_var!
if "!python_path!"=="" (
set python_path=python.exe
) else (
if NOT exist "!python_path!" (
echo "------------!python_path! not exist------------"
goto:eof
)
)
if EXIST "%workspace%\third-party" (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party exists, the third-party-05b862.tar.gz not exists."
) else (
echo "The directory of third_party exists, the third-party-05b862.tar.gz exists."
call:rm_rebuild_dir "%workspace%\third-party"
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
) else (
if NOT EXIST "%workspace%\third-party-05b862.tar.gz" (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
call:download_third_party
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
) else (
echo "The directory of third_party not exists, the third-party-05b862.tar.gz exists."
!python_path! %workspace%\untar.py %source_path%\third-party-05b862.tar.gz %workspace%
)
)
git submodule update --init --recursive
goto:eof
:download_third_party
rem Download the prebuilt third-party bundle into the workspace via PowerShell.
powershell.exe (new-object System.Net.WebClient).DownloadFile('https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz', ^
'%workspace%third-party-05b862.tar.gz')
goto:eof
:rm_rebuild_dir
del /f /s /q "%~1\*.*" >nul 2>&1
rd /s /q "%~1" >nul 2>&1
goto:eof
:remove_space
rem Strip leading and trailing spaces from the global variable tmp_var.
rem Callers set tmp_var, call this, then read tmp_var back.
:remove_left_space
if "%tmp_var:~0,1%"==" " (
set "tmp_var=%tmp_var:~1%"
goto remove_left_space
)
:remove_right_space
if "%tmp_var:~-1%"==" " (
set "tmp_var=%tmp_var:~0,-1%"
rem Fix: loop on the right-trim label directly; the old jump back to
rem remove_left_space rescanned the (already clean) left side each pass.
goto remove_right_space
)
goto:eof
\ No newline at end of file
...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND) ...@@ -34,6 +34,15 @@ elseif(SSE3_FOUND)
set(SIMD_FLAG ${SSE3_FLAG}) set(SIMD_FLAG ${SSE3_FLAG})
endif() endif()
if(WIN32)
    # Windows-only define for every target; presumably suppresses MSVC's
    # xkeycheck.h macroized-keyword check — confirm against the SDK headers.
    add_definitions(-D_XKEYCHECK_H)
    if(NOT MSVC)
        # FATAL_ERROR aborts configuration. The previous message(FATAL ...)
        # used an invalid mode keyword, so it only printed and carried on.
        message(FATAL_ERROR "Windows builds only support MSVC (required by NVIDIA's nvcc compiler).")
    endif()
endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
add_definitions(-DLITE_WITH_CUDA) add_definitions(-DLITE_WITH_CUDA)
add_definitions(-DEIGEN_USE_GPU) add_definitions(-DEIGEN_USE_GPU)
...@@ -134,6 +143,10 @@ if (LITE_WITH_NPU) ...@@ -134,6 +143,10 @@ if (LITE_WITH_NPU)
add_definitions("-DLITE_WITH_NPU") add_definitions("-DLITE_WITH_NPU")
endif() endif()
# Expose APU support to C/C++ sources as a preprocessor definition.
if (LITE_WITH_APU)
add_definitions("-DLITE_WITH_APU")
endif()
if (LITE_WITH_RKNPU) if (LITE_WITH_RKNPU)
add_definitions("-DLITE_WITH_RKNPU") add_definitions("-DLITE_WITH_RKNPU")
endif() endif()
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# APU DDK detection: locate the NeuronAdapter header and the neuron /
# neuron_adapter runtime libraries under APU_DDK_ROOT, and expose them as
# imported targets plus the cached apu_runtime_libs list.
if(NOT LITE_WITH_APU)
  return()
endif()

# APU_DDK_ROOT may come from the cache/command line or the environment.
if(NOT DEFINED APU_DDK_ROOT)
  set(APU_DDK_ROOT $ENV{APU_DDK_ROOT})
  if(NOT APU_DDK_ROOT)
    message(FATAL_ERROR "Must set APU_DDK_ROOT or env APU_DDK_ROOT when LITE_WITH_APU=ON")
  endif()
endif()
message(STATUS "APU_DDK_ROOT: ${APU_DDK_ROOT}")

find_path(APU_DDK_INC NAMES NeuronAdapter.h
  PATHS ${APU_DDK_ROOT}/include NO_DEFAULT_PATH)
if(NOT APU_DDK_INC)
  message(FATAL_ERROR "Can not find NeuronAdapter.h in ${APU_DDK_ROOT}/include")
endif()
message(STATUS "APU_DDK_INC: ${APU_DDK_INC}")

# Use the directory actually located by find_path instead of re-deriving it.
include_directories("${APU_DDK_INC}")

# Only the 64-bit layout is shipped today; the armv8 branch keeps the hook
# for other ABI-specific sub-paths.
set(APU_SUB_LIB_PATH "lib64")
if(ARM_TARGET_ARCH_ABI STREQUAL "armv8")
  set(APU_SUB_LIB_PATH "lib64")
endif()

# NO_DEFAULT_PATH keeps a host-installed library from shadowing the DDK one.
find_library(APU_NEURON_FILE NAMES neuron
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH} NO_DEFAULT_PATH)
find_library(APU_NEURON_ADAPTER_FILE NAMES neuron_adapter
  PATHS ${APU_DDK_ROOT}/${APU_SUB_LIB_PATH} NO_DEFAULT_PATH)

if(NOT APU_NEURON_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_FILE in ${APU_DDK_ROOT}")
else()
  message(STATUS "Found APU NEURON Library: ${APU_NEURON_FILE}")
  add_library(apu_neuron SHARED IMPORTED GLOBAL)
  set_property(TARGET apu_neuron PROPERTY IMPORTED_LOCATION ${APU_NEURON_FILE})
endif()

if(NOT APU_NEURON_ADAPTER_FILE)
  message(FATAL_ERROR "Can not find APU_NEURON_ADAPTER_FILE in ${APU_DDK_ROOT}")
else()
  message(STATUS "Found APU NEURON ADAPTER Library: ${APU_NEURON_ADAPTER_FILE}")
  add_library(apu_neuron_adapter SHARED IMPORTED GLOBAL)
  set_property(TARGET apu_neuron_adapter PROPERTY IMPORTED_LOCATION ${APU_NEURON_ADAPTER_FILE})
endif()

# Cached so other CMake files can link against the APU runtime.
set(apu_runtime_libs apu_neuron apu_neuron_adapter CACHE INTERNAL "apu runtime libs")
message(STATUS "apu_runtime_libs: ${apu_runtime_libs}")
...@@ -32,6 +32,7 @@ IF(WIN32) ...@@ -32,6 +32,7 @@ IF(WIN32)
SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/mklml.lib)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.lib)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_LIB_DEPS ${MKLML_LIB_DIR}/msvcr120.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSEIF(APPLE) ELSEIF(APPLE)
#TODO(intel-huying): #TODO(intel-huying):
......
...@@ -22,7 +22,7 @@ endfunction() ...@@ -22,7 +22,7 @@ endfunction()
function (lite_deps TARGET) function (lite_deps TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS ARGS) set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS HVY_DEPS CL_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS ARGS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps ${lite_deps_DEPS}) set(deps ${lite_deps_DEPS})
...@@ -88,6 +88,12 @@ function (lite_deps TARGET) ...@@ -88,6 +88,12 @@ function (lite_deps TARGET)
endforeach(var) endforeach(var)
endif() endif()
if (LITE_WITH_APU)
foreach(var ${lite_deps_APU_DEPS})
set(deps ${deps} ${var})
endforeach(var)
endif()
if (LITE_WITH_RKNPU) if (LITE_WITH_RKNPU)
foreach(var ${lite_deps_RKNPU_DEPS}) foreach(var ${lite_deps_RKNPU_DEPS})
set(deps ${deps} ${var}) set(deps ${deps} ${var})
...@@ -137,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean ...@@ -137,7 +143,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
function(lite_cc_library TARGET) function(lite_cc_library TARGET)
set(options SHARED shared STATIC static MODULE module) set(options SHARED shared STATIC static MODULE module)
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS CV_DEPS PROFILE_DEPS LIGHT_DEPS
HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS) HVY_DEPS EXCLUDE_COMPILE_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -153,6 +159,7 @@ function(lite_cc_library TARGET) ...@@ -153,6 +159,7 @@ function(lite_cc_library TARGET)
CV_DEPS ${args_CV_DEPS} CV_DEPS ${args_CV_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
PROFILE_DEPS ${args_PROFILE_DEPS} PROFILE_DEPS ${args_PROFILE_DEPS}
LIGHT_DEPS ${args_LIGHT_DEPS} LIGHT_DEPS ${args_LIGHT_DEPS}
...@@ -168,8 +175,10 @@ function(lite_cc_library TARGET) ...@@ -168,8 +175,10 @@ function(lite_cc_library TARGET)
else() else()
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
# collect targets need to compile for lite # collect targets need to compile for lite
if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS) if (args_SRCS AND NOT args_EXCLUDE_COMPILE_DEPS)
add_dependencies(lite_compile_deps ${TARGET}) add_dependencies(lite_compile_deps ${TARGET})
...@@ -184,7 +193,7 @@ function(lite_cc_binary TARGET) ...@@ -184,7 +193,7 @@ function(lite_cc_binary TARGET)
set(options " -g ") set(options " -g ")
endif() endif()
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS) LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -197,6 +206,7 @@ function(lite_cc_binary TARGET) ...@@ -197,6 +206,7 @@ function(lite_cc_binary TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -207,7 +217,9 @@ function(lite_cc_binary TARGET) ...@@ -207,7 +217,9 @@ function(lite_cc_binary TARGET)
MLU_DEPS ${args_MLU_DEPS} MLU_DEPS ${args_MLU_DEPS}
) )
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps}) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
if (NOT APPLE) if (NOT APPLE)
# strip binary target to reduce size # strip binary target to reduce size
if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug") if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
...@@ -234,7 +246,7 @@ function(lite_cc_test TARGET) ...@@ -234,7 +246,7 @@ function(lite_cc_test TARGET)
endif() endif()
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS CV_DEPS
ARGS ARGS
COMPILE_LEVEL # (basic|extra) COMPILE_LEVEL # (basic|extra)
...@@ -255,6 +267,7 @@ function(lite_cc_test TARGET) ...@@ -255,6 +267,7 @@ function(lite_cc_test TARGET)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -272,7 +285,9 @@ function(lite_cc_test TARGET) ...@@ -272,7 +285,9 @@ function(lite_cc_test TARGET)
"${TARGET}" "${TARGET}"
COMMENT "Strip debug symbols done on final executable file.") COMMENT "Strip debug symbols done on final executable file.")
endif() endif()
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers) if(NOT WIN32)
target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
endif()
file(APPEND ${offline_test_registry_file} "${TARGET}\n") file(APPEND ${offline_test_registry_file} "${TARGET}\n")
# collect targets need to compile for lite # collect targets need to compile for lite
...@@ -286,6 +301,7 @@ set(x86_kernels CACHE INTERNAL "x86 kernels") ...@@ -286,6 +301,7 @@ set(x86_kernels CACHE INTERNAL "x86 kernels")
set(cuda_kernels CACHE INTERNAL "cuda kernels") set(cuda_kernels CACHE INTERNAL "cuda kernels")
set(fpga_kernels CACHE INTERNAL "fpga kernels") set(fpga_kernels CACHE INTERNAL "fpga kernels")
set(npu_kernels CACHE INTERNAL "npu kernels") set(npu_kernels CACHE INTERNAL "npu kernels")
set(apu_kernels CACHE INTERNAL "apu kernels")
set(xpu_kernels CACHE INTERNAL "xpu kernels") set(xpu_kernels CACHE INTERNAL "xpu kernels")
set(mlu_kernels CACHE INTERNAL "mlu kernels") set(mlu_kernels CACHE INTERNAL "mlu kernels")
set(bm_kernels CACHE INTERNAL "bm kernels") set(bm_kernels CACHE INTERNAL "bm kernels")
...@@ -305,12 +321,12 @@ if(LITE_BUILD_TAILOR) ...@@ -305,12 +321,12 @@ if(LITE_BUILD_TAILOR)
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list) file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif() endif()
# add a kernel for some specific device # add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, MLU, FPGA, OPENCL, CUDA, BM, RKNPU) # device: one of (Host, ARM, X86, NPU, MLU, APU, FPGA, OPENCL, CUDA, BM, RKNPU)
# level: one of (basic, extra) # level: one of (basic, extra)
function(add_kernel TARGET device level) function(add_kernel TARGET device level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS RKNPU_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -368,6 +384,15 @@ function(add_kernel TARGET device level) ...@@ -368,6 +384,15 @@ function(add_kernel TARGET device level)
endif() endif()
set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "") set(npu_kernels "${npu_kernels};${TARGET}" CACHE INTERNAL "")
endif() endif()
if ("${device}" STREQUAL "APU")
if (NOT LITE_WITH_APU)
foreach(src ${args_SRCS})
file(APPEND ${fake_kernels_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
return()
endif()
set(apu_kernels "${apu_kernels};${TARGET}" CACHE INTERNAL "")
endif()
if ("${device}" STREQUAL "XPU") if ("${device}" STREQUAL "XPU")
if (NOT LITE_WITH_XPU) if (NOT LITE_WITH_XPU)
foreach(src ${args_SRCS}) foreach(src ${args_SRCS})
...@@ -451,6 +476,7 @@ function(add_kernel TARGET device level) ...@@ -451,6 +476,7 @@ function(add_kernel TARGET device level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -473,7 +499,7 @@ endif() ...@@ -473,7 +499,7 @@ endif()
function(add_operator TARGET level) function(add_operator TARGET level)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS PROFILE_DEPS set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS CL_DEPS ARM_DEPS FPGA_DEPS BM_DEPS NPU_DEPS XPU_DEPS MLU_DEPS APU_DEPS PROFILE_DEPS
LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS LIGHT_DEPS HVY_DEPS EXCLUDE_COMPILE_DEPS
ARGS) ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
...@@ -506,6 +532,7 @@ function(add_operator TARGET level) ...@@ -506,6 +532,7 @@ function(add_operator TARGET level)
ARM_DEPS ${args_ARM_DEPS} ARM_DEPS ${args_ARM_DEPS}
FPGA_DEPS ${args_FPGA_DEPS} FPGA_DEPS ${args_FPGA_DEPS}
NPU_DEPS ${args_NPU_DEPS} NPU_DEPS ${args_NPU_DEPS}
APU_DEPS ${args_APU_DEPS}
XPU_DEPS ${args_XPU_DEPS} XPU_DEPS ${args_XPU_DEPS}
RKNPU_DEPS ${args_RKNPU_DEPS} RKNPU_DEPS ${args_RKNPU_DEPS}
BM_DEPS ${args_BM_DEPS} BM_DEPS ${args_BM_DEPS}
...@@ -516,6 +543,29 @@ function(add_operator TARGET level) ...@@ -516,6 +543,29 @@ function(add_operator TARGET level)
) )
endfunction() endfunction()
# Windows-only helper: merge_static_libs cannot cope with arbitrarily long
# input lists, so the libraries are merged in chunks. Each chunk becomes an
# intermediate "<target>_dummy_<i>" archive, and those intermediates are then
# merged into the final TARGET_NAME archive.
function(create_static_lib TARGET_NAME)
  set(input_libs ${ARGN})
  list(REMOVE_DUPLICATES input_libs)
  list(LENGTH input_libs total_count)
  # A chunk is flushed once it holds more than this many libraries.
  set(chunk_limit 60)
  set(chunk_index 1)
  set(position 1)
  set(current_chunk)
  set(partial_targets)
  foreach(one_lib ${input_libs})
    list(APPEND current_chunk ${one_lib})
    list(LENGTH current_chunk chunk_size)
    # Flush on overflow or when the last input library has been consumed.
    if((${chunk_size} GREATER ${chunk_limit}) OR (${position} EQUAL ${total_count}))
      merge_static_libs(${TARGET_NAME}_dummy_${chunk_index} ${current_chunk})
      list(APPEND partial_targets ${TARGET_NAME}_dummy_${chunk_index})
      set(current_chunk)
      math(EXPR chunk_index "${chunk_index}+1")
    endif()
    math(EXPR position "${position}+1")
  endforeach()
  merge_static_libs(${TARGET_NAME} ${partial_targets})
endfunction()
# Bundle several static libraries into one. # Bundle several static libraries into one.
function(bundle_static_library tgt_name bundled_tgt_name fake_target) function(bundle_static_library tgt_name bundled_tgt_name fake_target)
...@@ -559,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target) ...@@ -559,7 +609,22 @@ function(bundle_static_library tgt_name bundled_tgt_name fake_target)
set(bundled_tgt_full_name set(bundled_tgt_full_name
${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}) ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
#message(STATUS "bundled_tgt_full_name: ${bundled_tgt_full_name}") message(STATUS "bundled_tgt_full_name: ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX}")
if(WIN32)
set(dummy_tgt_name dummy_${bundled_tgt_name})
create_static_lib(${bundled_tgt_name} ${static_libs})
add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_name})
add_dependencies(${fake_target} ${tgt_name})
add_library(${dummy_tgt_name} STATIC IMPORTED)
set_target_properties(${dummy_tgt_name}
PROPERTIES
IMPORTED_LOCATION ${bundled_tgt_full_name}
INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
add_dependencies(${dummy_tgt_name} ${fake_target})
return()
endif()
if(NOT IOS) if(NOT IOS)
file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
......
...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}") ...@@ -9,6 +9,7 @@ message(STATUS "LITE_WITH_OPENCL:\t${LITE_WITH_OPENCL}")
message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}") message(STATUS "LITE_WITH_NPU:\t${LITE_WITH_NPU}")
message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}") message(STATUS "LITE_WITH_RKNPU:\t${LITE_WITH_RKNPU}")
message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}") message(STATUS "LITE_WITH_XPU:\t${LITE_WITH_XPU}")
message(STATUS "LITE_WITH_APU:\t${LITE_WITH_APU}")
message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}") message(STATUS "LITE_WITH_XTCL:\t${LITE_WITH_XTCL}")
message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}") message(STATUS "LITE_WITH_FPGA:\t${LITE_WITH_FPGA}")
message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}") message(STATUS "LITE_WITH_MLU:\t${LITE_WITH_MLU}")
...@@ -71,6 +72,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM) ...@@ -71,6 +72,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
if (LITE_WITH_XPU) if (LITE_WITH_XPU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.xpu")
endif(LITE_WITH_XPU) endif(LITE_WITH_XPU)
if (LITE_WITH_APU)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.apu")
endif(LITE_WITH_APU)
if (LITE_WITH_FPGA) if (LITE_WITH_FPGA)
set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga") set(INFER_LITE_PUBLISH_ROOT "${INFER_LITE_PUBLISH_ROOT}.fpga")
endif(LITE_WITH_FPGA) endif(LITE_WITH_FPGA)
...@@ -87,6 +91,38 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}") ...@@ -87,6 +91,38 @@ message(STATUS "publish inference lib to ${INFER_LITE_PUBLISH_ROOT}")
# add python lib # add python lib
if (LITE_WITH_PYTHON) if (LITE_WITH_PYTHON)
if(WIN32)
set(LITE_CORE "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd")
set(LITE_CORE_DEPS ${LITE_CORE})
add_custom_command(OUTPUT ${LITE_CORE}
COMMAND cmake -E copy $<TARGET_FILE:lite_pybind> ${LITE_CORE}
DEPENDS lite_pybind)
add_custom_target(copy_lite_pybind ALL DEPENDS ${LITE_CORE_DEPS})
add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/libs"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/setup.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/python/__init__.py" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/install/lite/lite.pyd"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/python/pybind/${CMAKE_BUILD_TYPE}/lite.pyd" "${INFER_LITE_PUBLISH_ROOT}/python/lib/lite.pyd"
DEPENDS copy_lite_pybind
)
add_custom_target(publish_inference_python_installer ${TARGET}
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
WORKING_DIRECTORY ${INFER_LITE_PUBLISH_ROOT}/python/install/
DEPENDS publish_inference_python_lib)
add_custom_target(publish_inference_python_light_demo ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/python"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_light_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/demo/python/mobilenetv1_full_api.py" "${INFER_LITE_PUBLISH_ROOT}/demo/python/"
)
add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo)
else()
if(APPLE) if(APPLE)
add_custom_target(publish_inference_python_lib ${TARGET} add_custom_target(publish_inference_python_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/python/lib"
...@@ -123,6 +159,7 @@ if (LITE_WITH_PYTHON) ...@@ -123,6 +159,7 @@ if (LITE_WITH_PYTHON)
add_dependencies(publish_inference publish_inference_python_lib) add_dependencies(publish_inference publish_inference_python_lib)
add_dependencies(publish_inference publish_inference_python_installer) add_dependencies(publish_inference publish_inference_python_installer)
add_dependencies(publish_inference publish_inference_python_light_demo) add_dependencies(publish_inference publish_inference_python_light_demo)
endif(WIN32)
endif() endif()
if (LITE_WITH_CUDA OR LITE_WITH_X86) if (LITE_WITH_CUDA OR LITE_WITH_X86)
...@@ -141,7 +178,7 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86) ...@@ -141,7 +178,7 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
add_dependencies(publish_inference_cxx_lib paddle_light_api_shared) add_dependencies(publish_inference_cxx_lib paddle_light_api_shared)
add_dependencies(publish_inference publish_inference_cxx_lib) add_dependencies(publish_inference publish_inference_cxx_lib)
add_dependencies(publish_inference publish_inference_third_party) add_dependencies(publish_inference publish_inference_third_party)
else() elseif(NOT WIN32)
add_custom_target(publish_inference_cxx_lib ${TARGET} add_custom_target(publish_inference_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -164,6 +201,36 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86) ...@@ -164,6 +201,36 @@ if (LITE_WITH_CUDA OR LITE_WITH_X86)
endif() endif()
if (LITE_WITH_X86) if (LITE_WITH_X86)
if(WIN32)
add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_api.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_place.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_use_passes.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_SOURCE_DIR}/lite/api/paddle_lite_factory_helper.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_full_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_BINARY_DIR}/lite/api/${CMAKE_BUILD_TYPE}/libpaddle_api_light_bundled.lib" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
)
add_dependencies(publish_inference_x86_cxx_lib bundle_full_api)
add_dependencies(publish_inference_x86_cxx_lib bundle_light_api)
add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_custom_target(publish_inference_x86_cxx_demos ${TARGET}
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/install" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E copy_directory "${CMAKE_BINARY_DIR}/third_party/eigen3" "${INFER_LITE_PUBLISH_ROOT}/third_party"
COMMAND ${CMAKE_COMMAND} -E make_directory "${INFER_LITE_PUBLISH_ROOT}/demo/cxx"
)
add_dependencies(publish_inference_x86_cxx_lib publish_inference_x86_cxx_demos)
add_dependencies(publish_inference_x86_cxx_demos paddle_api_full_bundled eigen3)
else()
add_custom_target(publish_inference_x86_cxx_lib ${TARGET} add_custom_target(publish_inference_x86_cxx_lib ${TARGET}
COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/bin"
COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin" COMMAND cp "${CMAKE_BINARY_DIR}/lite/api/test_model_bin" "${INFER_LITE_PUBLISH_ROOT}/bin"
...@@ -178,6 +245,7 @@ if (LITE_WITH_X86) ...@@ -178,6 +245,7 @@ if (LITE_WITH_X86)
add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3) add_dependencies(publish_inference_x86_cxx_demos paddle_full_api_shared eigen3)
add_dependencies(publish_inference publish_inference_x86_cxx_lib) add_dependencies(publish_inference publish_inference_x86_cxx_lib)
add_dependencies(publish_inference publish_inference_x86_cxx_demos) add_dependencies(publish_inference publish_inference_x86_cxx_demos)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
......
...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -23,6 +23,9 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL) if (NOT LITE_ON_MODEL_OPTIMIZE_TOOL)
add_dependencies(paddle_full_api_shared dynload_mklml) add_dependencies(paddle_full_api_shared dynload_mklml)
endif() endif()
if(WIN32)
target_link_libraries(paddle_full_api_shared shlwapi.lib)
endif()
endif() endif()
if(LITE_WITH_CUDA) if(LITE_WITH_CUDA)
target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive") target_link_libraries(paddle_full_api_shared ${math_cuda} "-Wl,--whole-archive" ${cuda_kernels} "-Wl,--no-whole-archive")
...@@ -34,12 +37,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH ...@@ -34,12 +37,13 @@ if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_CUDA OR LITE_WITH_X86 OR LITE_WITH
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
) )
add_dependencies(paddle_light_api_shared op_list_h kernel_list_h) add_dependencies(paddle_light_api_shared op_list_h kernel_list_h)
target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels}) target_link_libraries(paddle_light_api_shared ${light_lib_DEPS} ${arm_kernels} ${npu_kernels} ${rknpu_kernels} ${apu_kernels})
if(NOT APPLE) if(NOT APPLE AND NOT WIN32)
set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map") set(LINK_MAP_FILE "${PADDLE_SOURCE_DIR}/lite/core/lite.map")
set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}") set(LINK_FLAGS "-Wl,--version-script ${LINK_MAP_FILE}")
add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...) add_custom_command(OUTPUT ${LINK_MAP_FILE} COMMAND ...)
...@@ -78,7 +82,9 @@ if (WITH_TESTING) ...@@ -78,7 +82,9 @@ if (WITH_TESTING)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
MLU_DEPS ${mlu_kernels}) MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels})
endif() endif()
if(LITE_WITH_FPGA) if(LITE_WITH_FPGA)
set(light_api_deps ${light_api_deps} ${fpga_deps}) set(light_api_deps ${light_api_deps} ${fpga_deps})
...@@ -103,6 +109,7 @@ message(STATUS "get Host kernels ${host_kernels}") ...@@ -103,6 +109,7 @@ message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}") message(STATUS "get ARM kernels ${arm_kernels}")
message(STATUS "get OpenCL kernels ${opencl_kernels}") message(STATUS "get OpenCL kernels ${opencl_kernels}")
message(STATUS "get NPU kernels ${npu_kernels}") message(STATUS "get NPU kernels ${npu_kernels}")
message(STATUS "get APU kernels ${apu_kernels}")
message(STATUS "get XPU kernels ${xpu_kernels}") message(STATUS "get XPU kernels ${xpu_kernels}")
message(STATUS "get RKNPU kernels ${rknpu_kernels}") message(STATUS "get RKNPU kernels ${rknpu_kernels}")
message(STATUS "get FPGA kernels ${fpga_kernels}") message(STATUS "get FPGA kernels ${fpga_kernels}")
...@@ -122,6 +129,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -122,6 +129,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -143,6 +151,7 @@ lite_cc_library(light_api SRCS light_api.cc ...@@ -143,6 +151,7 @@ lite_cc_library(light_api SRCS light_api.cc
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -163,6 +172,7 @@ if(WITH_TESTING) ...@@ -163,6 +172,7 @@ if(WITH_TESTING)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
...@@ -220,7 +230,7 @@ if(WITH_TESTING) ...@@ -220,7 +230,7 @@ if(WITH_TESTING)
endif() endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${fpga_kernels}) set(lite_model_test_DEPS cxx_api mir_passes ${ops} ${host_kernels} ${arm_kernels} ${npu_kernels} ${apu_kernels} ${fpga_kernels})
lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc lite_cc_test(test_mobilenetv1_int8 SRCS mobilenetv1_int8_test.cc
DEPS ${lite_model_test_DEPS} DEPS ${lite_model_test_DEPS}
...@@ -292,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -292,6 +302,7 @@ if (NOT LITE_ON_TINY_PUBLISH)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
BM_DEPS ${bm_kernels}) BM_DEPS ${bm_kernels})
...@@ -356,6 +367,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle ...@@ -356,6 +367,7 @@ lite_cc_test(test_paddle_api SRCS paddle_api_test.cc DEPS paddle_api_full paddle
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -376,6 +388,7 @@ if(NOT IOS) ...@@ -376,6 +388,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -390,6 +403,7 @@ if(NOT IOS) ...@@ -390,6 +403,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -404,6 +418,7 @@ if(NOT IOS) ...@@ -404,6 +418,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
BM_DEPS ${bm_kernels} BM_DEPS ${bm_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
...@@ -419,6 +434,7 @@ if(NOT IOS) ...@@ -419,6 +434,7 @@ if(NOT IOS)
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
...@@ -429,6 +445,7 @@ if(NOT IOS) ...@@ -429,6 +445,7 @@ if(NOT IOS)
ARM_DEPS ${arm_kernels} ARM_DEPS ${arm_kernels}
CV_DEPS paddle_cv_arm CV_DEPS paddle_cv_arm
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
APU_DEPS ${apu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
RKNPU_DEPS ${rknpu_kernels} RKNPU_DEPS ${rknpu_kernels}
MLU_DEPS ${mlu_kernels} MLU_DEPS ${mlu_kernels}
...@@ -445,6 +462,7 @@ if(NOT IOS) ...@@ -445,6 +462,7 @@ if(NOT IOS)
NPU_DEPS ${npu_kernels} NPU_DEPS ${npu_kernels}
RKNPU_DEPS ${npu_kernels} RKNPU_DEPS ${npu_kernels}
XPU_DEPS ${xpu_kernels} XPU_DEPS ${xpu_kernels}
APU_DEPS ${apu_kernels}
CL_DEPS ${opencl_kernels} CL_DEPS ${opencl_kernels}
FPGA_DEPS ${fpga_kernels} FPGA_DEPS ${fpga_kernels}
X86_DEPS ${x86_kernels} X86_DEPS ${x86_kernels}
......
...@@ -13,7 +13,13 @@ ...@@ -13,7 +13,13 @@
// limitations under the License. // limitations under the License.
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include <time.h> #include <time.h>
#include <algorithm> #include <algorithm>
#include <cstdio> #include <cstdio>
......
...@@ -19,6 +19,11 @@ ...@@ -19,6 +19,11 @@
#include "lite/api/paddle_api.h" #include "lite/api/paddle_api.h"
#include "lite/core/device_info.h" #include "lite/core/device_info.h"
#include "lite/core/version.h" #include "lite/core/version.h"
#ifndef LITE_ON_TINY_PUBLISH
#include "lite/api/paddle_use_passes.h"
#endif
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \ #if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__) !(defined LITE_ON_MODEL_OPTIMIZE_TOOL) && !defined(__APPLE__)
#include <omp.h> #include <omp.h>
......
...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) { ...@@ -82,7 +82,7 @@ Tensor* LightPredictor::GetInputByName(const std::string& name) {
if (element == input_names_.end()) { if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:"; << "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) { for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]"; LOG(ERROR) << "[" << input_names_[i] << "]";
} }
return nullptr; return nullptr;
...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -114,7 +114,7 @@ void LightPredictor::PrepareFeedFetch() {
auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0); auto current_block = cpp_program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc*> feeds; std::vector<cpp::OpDesc*> feeds;
std::vector<cpp::OpDesc*> fetchs; std::vector<cpp::OpDesc*> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) { for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i); auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") { if (op->Type() == "feed") {
feeds.push_back(op); feeds.push_back(op);
...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() { ...@@ -124,11 +124,11 @@ void LightPredictor::PrepareFeedFetch() {
} }
input_names_.resize(feeds.size()); input_names_.resize(feeds.size());
output_names_.resize(fetchs.size()); output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) { for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] = input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front(); feeds[i]->Output("Out").front();
} }
for (int i = 0; i < fetchs.size(); i++) { for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] = output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front(); fetchs[i]->Input("X").front();
} }
......
...@@ -37,11 +37,11 @@ TEST(LightAPI, load) { ...@@ -37,11 +37,11 @@ TEST(LightAPI, load) {
const std::vector<std::string> inputs = predictor.GetInputNames(); const std::vector<std::string> inputs = predictor.GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
const std::vector<std::string> outputs = predictor.GetOutputNames(); const std::vector<std::string> outputs = predictor.GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
......
...@@ -293,13 +293,13 @@ int main(int argc, char** argv) { ...@@ -293,13 +293,13 @@ int main(int argc, char** argv) {
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
std::vector<std::string> str_input_shapes_0 = std::vector<std::string> str_input_shapes_0 =
split_string(FLAGS_input_shape_0); split_string(FLAGS_input_shape_0);
std::vector<std::vector<int64_t>> input_shapes_0; std::vector<std::vector<int64_t>> input_shapes_0;
for (int i = 0; i < str_input_shapes_0.size(); ++i) { for (size_t i = 0; i < str_input_shapes_0.size(); ++i) {
input_shapes_0.push_back(get_shape(str_input_shapes_0[i])); input_shapes_0.push_back(get_shape(str_input_shapes_0[i]));
} }
......
...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir, ...@@ -44,9 +44,15 @@ void OutputOptModel(const std::string& load_model_dir,
const std::vector<std::vector<int64_t>>& input_shapes) { const std::vector<std::vector<int64_t>>& input_shapes) {
lite_api::CxxConfig config; lite_api::CxxConfig config;
config.set_model_dir(load_model_dir); config.set_model_dir(load_model_dir);
#ifdef LITE_WITH_X86
config.set_valid_places({Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kInt64)},
Place{TARGET(kHost), PRECISION(kFloat)}});
#else
config.set_valid_places({ config.set_valid_places({
Place{TARGET(kARM), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)},
}); });
#endif
auto predictor = lite_api::CreatePaddlePredictor(config); auto predictor = lite_api::CreatePaddlePredictor(config);
// delete old optimized model // delete old optimized model
...@@ -198,7 +204,7 @@ int main(int argc, char** argv) { ...@@ -198,7 +204,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -310,7 +310,7 @@ int main(int argc, char** argv) { ...@@ -310,7 +310,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -114,7 +114,7 @@ void detect_object(const float* dout, ...@@ -114,7 +114,7 @@ void detect_object(const float* dout,
} }
std::string name = FLAGS_out_txt + "_accu.txt"; std::string name = FLAGS_out_txt + "_accu.txt";
FILE* fp = fopen(name.c_str(), "w"); FILE* fp = fopen(name.c_str(), "w");
for (int i = 0; i < objects.size(); ++i) { for (size_t i = 0; i < objects.size(); ++i) {
Object object = objects.at(i); Object object = objects.at(i);
if (object.prob > thresh && object.x > 0 && object.y > 0 && if (object.prob > thresh && object.x > 0 && object.y > 0 &&
object.width > 0 && object.height > 0) { object.width > 0 && object.height > 0) {
...@@ -324,7 +324,7 @@ int main(int argc, char** argv) { ...@@ -324,7 +324,7 @@ int main(int argc, char** argv) {
LOG(INFO) << "input shapes: " << FLAGS_input_shape; LOG(INFO) << "input shapes: " << FLAGS_input_shape;
std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape); std::vector<std::string> str_input_shapes = split_string(FLAGS_input_shape);
std::vector<std::vector<int64_t>> input_shapes; std::vector<std::vector<int64_t>> input_shapes;
for (int i = 0; i < str_input_shapes.size(); ++i) { for (size_t i = 0; i < str_input_shapes.size(); ++i) {
LOG(INFO) << "input shape: " << str_input_shapes[i]; LOG(INFO) << "input shape: " << str_input_shapes[i];
input_shapes.push_back(get_shape(str_input_shapes[i])); input_shapes.push_back(get_shape(str_input_shapes[i]));
} }
......
...@@ -104,17 +104,21 @@ std::vector<Place> ParserValidPlaces() { ...@@ -104,17 +104,21 @@ std::vector<Place> ParserValidPlaces() {
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel TARGET(kARM)); // enable kARM CPU kernel when no opencl kernel
} else if (target_repr == "x86") { } else if (target_repr == "x86") {
valid_places.emplace_back(TARGET(kX86)); valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kFloat)});
valid_places.emplace_back(Place{TARGET(kX86), PRECISION(kInt64)});
} else if (target_repr == "npu") { } else if (target_repr == "npu") {
valid_places.emplace_back(TARGET(kNPU)); valid_places.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places.emplace_back(TARGET(kXPU)); valid_places.emplace_back(TARGET(kXPU));
} else if (target_repr == "mlu") {
valid_places.emplace_back(TARGET(kMLU));
} else if (target_repr == "rknpu") { } else if (target_repr == "rknpu") {
valid_places.emplace_back(TARGET(kRKNPU)); valid_places.emplace_back(TARGET(kRKNPU));
valid_places.emplace_back( valid_places.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW)); TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "mlu") { } else if (target_repr == "apu") {
valid_places.emplace_back(TARGET(kMLU)); valid_places.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -192,6 +196,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) { ...@@ -192,6 +196,7 @@ void PrintOpsInfo(std::set<std::string> valid_ops = {}) {
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU", "kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
int maximum_optype_length = 0; int maximum_optype_length = 0;
...@@ -256,16 +261,16 @@ void PrintHelpInfo() { ...@@ -256,16 +261,16 @@ void PrintHelpInfo() {
" `--param_file=<param_path>`\n" " `--param_file=<param_path>`\n"
" `--optimize_out_type=(protobuf|naive_buffer)`\n" " `--optimize_out_type=(protobuf|naive_buffer)`\n"
" `--optimize_out=<output_optimize_model_dir>`\n" " `--optimize_out=<output_optimize_model_dir>`\n"
" `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`\n" " `--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `--record_tailoring_info=(true|false)`\n" " `--record_tailoring_info=(true|false)`\n"
" Arguments of model checking and ops information:\n" " Arguments of model checking and ops information:\n"
" `--print_all_ops=true` Display all the valid operators of " " `--print_all_ops=true` Display all the valid operators of "
"Paddle-Lite\n" "Paddle-Lite\n"
" `--print_supported_ops=true " " `--print_supported_ops=true "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display valid operators of input targets\n" " Display valid operators of input targets\n"
" `--print_model_ops=true --model_dir=<model_param_dir> " " `--print_model_ops=true --model_dir=<model_param_dir> "
"--valid_targets=(arm|opencl|x86|npu|xpu|rknpu)`" "--valid_targets=(arm|opencl|x86|npu|xpu|rknpu|apu)`"
" Display operators in the input model\n"; " Display operators in the input model\n";
std::cout << "opt version:" << opt_version << std::endl std::cout << "opt version:" << opt_version << std::endl
<< help_info << std::endl; << help_info << std::endl;
......
...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) { ...@@ -63,6 +63,13 @@ void OptBase::SetValidPlaces(const std::string& valid_places) {
valid_places_.emplace_back(TARGET(kNPU)); valid_places_.emplace_back(TARGET(kNPU));
} else if (target_repr == "xpu") { } else if (target_repr == "xpu") {
valid_places_.emplace_back(TARGET(kXPU)); valid_places_.emplace_back(TARGET(kXPU));
} else if (target_repr == "rknpu") {
valid_places_.emplace_back(TARGET(kRKNPU));
valid_places_.emplace_back(
TARGET(kRKNPU), PRECISION(kInt8), DATALAYOUT(kNCHW));
} else if (target_repr == "apu") {
valid_places_.emplace_back(
Place{TARGET(kAPU), PRECISION(kInt8), DATALAYOUT(kNCHW)});
} else { } else {
LOG(FATAL) << lite::string_format( LOG(FATAL) << lite::string_format(
"Wrong target '%s' found, please check the command flag " "Wrong target '%s' found, please check the command flag "
...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() { ...@@ -183,7 +190,7 @@ void OptBase::PrintHelpInfo() {
" `set_param_file(param_file_path)`\n" " `set_param_file(param_file_path)`\n"
" `set_model_type(protobuf|naive_buffer)`\n" " `set_model_type(protobuf|naive_buffer)`\n"
" `set_optimize_out(output_optimize_model_dir)`\n" " `set_optimize_out(output_optimize_model_dir)`\n"
" `set_valid_places(arm|opencl|x86|npu|xpu)`\n" " `set_valid_places(arm|opencl|x86|npu|xpu|rknpu|apu)`\n"
" `run_optimize(false|true)`\n" " `run_optimize(false|true)`\n"
" ` ----fasle&true refer to whether to record ops info for " " ` ----fasle&true refer to whether to record ops info for "
"tailoring lib, false by default`\n" "tailoring lib, false by default`\n"
...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) { ...@@ -208,6 +215,8 @@ void OptBase::PrintOpsInfo(const std::set<std::string>& valid_ops) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kRKNPU",
"kAPU",
"kAny", "kAny",
"kUnk"}; "kUnk"};
// Get the lengh of the first column: maximum length of the op_type // Get the lengh of the first column: maximum length of the op_type
......
...@@ -36,11 +36,11 @@ TEST(CxxApi, run) { ...@@ -36,11 +36,11 @@ TEST(CxxApi, run) {
auto inputs = predictor->GetInputNames(); auto inputs = predictor->GetInputNames();
LOG(INFO) << "input size: " << inputs.size(); LOG(INFO) << "input size: " << inputs.size();
for (int i = 0; i < inputs.size(); i++) { for (size_t i = 0; i < inputs.size(); i++) {
LOG(INFO) << "inputnames: " << inputs[i]; LOG(INFO) << "inputnames: " << inputs[i];
} }
auto outputs = predictor->GetOutputNames(); auto outputs = predictor->GetOutputNames();
for (int i = 0; i < outputs.size(); i++) { for (size_t i = 0; i < outputs.size(); i++) {
LOG(INFO) << "outputnames: " << outputs[i]; LOG(INFO) << "outputnames: " << outputs[i];
} }
auto input_tensor = predictor->GetInputByName(inputs[0]); auto input_tensor = predictor->GetInputByName(inputs[0]);
......
...@@ -18,20 +18,21 @@ ...@@ -18,20 +18,21 @@
*/ */
#pragma once #pragma once
#define USE_LITE_OP(op_type__) \ // some platform-independent defintion
extern int touch_op_##op_type__(); \ #include "lite/utils/macros.h"
int LITE_OP_REGISTER_FAKE(op_type__) __attribute__((unused)) = \
touch_op_##op_type__(); #define USE_LITE_OP(op_type__) \
extern int touch_op_##op_type__(); \
int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
#define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
__attribute__((unused)) = \ UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
touch_##op_type__##target__##precision__##layout__##alias__();
#define USE_MIR_PASS(name__) \ #define USE_MIR_PASS(name__) \
extern bool mir_pass_registry##name__##_fake(); \ extern bool mir_pass_registry##name__##_fake(); \
static bool mir_pass_usage##name__ __attribute__((unused)) = \ static bool mir_pass_usage##name__ UNUSED = \
mir_pass_registry##name__##_fake(); mir_pass_registry##name__##_fake();
#define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
...@@ -73,7 +73,8 @@ const std::string& TargetToStr(TargetType target) { ...@@ -73,7 +73,8 @@ const std::string& TargetToStr(TargetType target) {
"xpu", "xpu",
"bm", "bm",
"mlu", "mlu",
"rknpu"}; "rknpu",
"apu"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -113,9 +114,10 @@ const std::string& TargetRepr(TargetType target) { ...@@ -113,9 +114,10 @@ const std::string& TargetRepr(TargetType target) {
"kFPGA", "kFPGA",
"kNPU", "kNPU",
"kXPU", "kXPU",
"kMLU",
"kBM", "kBM",
"kRKNPU"}; "kMLU",
"kRKNPU",
"kAPU"};
auto x = static_cast<int>(target); auto x = static_cast<int>(target);
CHECK_LT(x, static_cast<int>(TARGET(NUM))); CHECK_LT(x, static_cast<int>(TARGET(NUM)));
return target2string[x]; return target2string[x];
...@@ -158,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) { ...@@ -158,6 +160,7 @@ std::set<TargetType> ExpandValidTargets(TargetType target) {
TARGET(kXPU), TARGET(kXPU),
TARGET(kBM), TARGET(kBM),
TARGET(kMLU), TARGET(kMLU),
TARGET(kAPU),
TARGET(kFPGA)}); TARGET(kFPGA)});
if (target == TARGET(kAny)) { if (target == TARGET(kAny)) {
return valid_set; return valid_set;
......
...@@ -49,14 +49,15 @@ enum class TargetType : int { ...@@ -49,14 +49,15 @@ enum class TargetType : int {
kCUDA = 3, kCUDA = 3,
kARM = 4, kARM = 4,
kOpenCL = 5, kOpenCL = 5,
kAny = 6, // any target
kFPGA = 7, kFPGA = 7,
kNPU = 8, kNPU = 8,
kXPU = 9, kXPU = 9,
kBM = 10, kBM = 10,
kMLU = 11, kMLU = 11,
kRKNPU = 12, kRKNPU = 12,
kAny = 6, // any target kAPU = 13,
NUM = 13, // number of fields. NUM = 14, // number of fields.
}; };
enum class PrecisionType : int { enum class PrecisionType : int {
kUnk = 0, kUnk = 0,
......
...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass); ...@@ -49,6 +49,7 @@ USE_MIR_PASS(xpu_subgraph_pass);
USE_MIR_PASS(mlu_subgraph_pass); USE_MIR_PASS(mlu_subgraph_pass);
USE_MIR_PASS(mlu_postprocess_pass); USE_MIR_PASS(mlu_postprocess_pass);
USE_MIR_PASS(weight_quantization_preprocess_pass); USE_MIR_PASS(weight_quantization_preprocess_pass);
USE_MIR_PASS(apu_subgraph_pass);
USE_MIR_PASS(quantized_op_attributes_inference_pass); USE_MIR_PASS(quantized_op_attributes_inference_pass);
USE_MIR_PASS(__xpu__resnet_fuse_pass); USE_MIR_PASS(__xpu__resnet_fuse_pass);
USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
...@@ -11,3 +11,12 @@ ...@@ -11,3 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import sys
if os.name =='nt':
current_path = os.path.abspath(os.path.dirname(__file__))
third_lib_path = current_path + os.sep + 'libs'
os.environ['path'] = third_lib_path+ ';' + os.environ['path']
sys.path.insert(0, third_lib_path)
...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -3,7 +3,14 @@ if (NOT LITE_ON_TINY_PUBLISH)
set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base) set(PYBIND_DEPS ${PYBIND_DEPS} paddle_api_full opt_base)
endif() endif()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS}) if(WIN32)
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
target_link_libraries(lite_pybind ${os_dependency_modules})
else()
lite_cc_library(lite_pybind SHARED SRCS pybind.cc DEPS ${PYBIND_DEPS})
endif(WIN32)
if (LITE_ON_TINY_PUBLISH) if (LITE_ON_TINY_PUBLISH)
set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections") set_target_properties(lite_pybind PROPERTIES COMPILE_FLAGS "-flto -fdata-sections")
endif() endif()
...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) { ...@@ -183,6 +183,8 @@ void BindLitePlace(py::module *m) {
.value("FPGA", TargetType::kFPGA) .value("FPGA", TargetType::kFPGA)
.value("NPU", TargetType::kNPU) .value("NPU", TargetType::kNPU)
.value("MLU", TargetType::kMLU) .value("MLU", TargetType::kMLU)
.value("RKNPU", TargetType::kRKNPU)
.value("APU", TargetType::kAPU)
.value("Any", TargetType::kAny); .value("Any", TargetType::kAny);
// PrecisionType // PrecisionType
......
...@@ -34,20 +34,27 @@ else: ...@@ -34,20 +34,27 @@ else:
# core lib of paddlelite is stored as lite.so # core lib of paddlelite is stored as lite.so
LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite' LITE_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/lite'
PACKAGE_DATA = {'paddlelite': ['lite.so']} PACKAGE_DATA = {'paddlelite': ['lite.so' if os.name!='nt' else 'lite.pyd']}
# put all thirdparty libraries in paddlelite.libs # put all thirdparty libraries in paddlelite.libs
PACKAGE_DATA['paddlelite.libs'] = [] PACKAGE_DATA['paddlelite.libs'] = []
LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs' LIB_PATH = '${PADDLE_BINARY_DIR}/inference_lite_lib/python/install/libs'
if '${WITH_MKL}' == 'ON': if '${WITH_MKL}' == 'ON':
shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_IOMP_LIB}', LIB_PATH)
shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH) shutil.copy('${MKLML_SHARED_LIB}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so'] if os.name != 'nt':
PACKAGE_DATA['paddlelite.libs'] += ['libmklml_intel.so', 'libiomp5.so']
else:
PACKAGE_DATA['paddlelite.libs'] += ['libiomp5md.dll', 'mklml.dll']
shutil.copy('${MKLML_SHARED_LIB_DEPS}', LIB_PATH)
PACKAGE_DATA['paddlelite.libs'] += ['msvcr120.dll']
# link lite.so to paddlelite.libs # link lite.so to paddlelite.libs
COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\ if os.name != 'nt':
/inference_lite_lib/python/install/lite/lite.so" COMMAND = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}\
if os.system(COMMAND) != 0: /inference_lite_lib/python/install/lite/lite.so"
raise Exception("patch third_party libs failed, command: %s" % COMMAND) if os.system(COMMAND) != 0:
raise Exception("patch third_party libs failed, command: %s" % COMMAND)
# remove unused paddle/libs/__init__.py # remove unused paddle/libs/__init__.py
if os.path.isfile(LIB_PATH+'/__init__.py'): if os.path.isfile(LIB_PATH+'/__init__.py'):
...@@ -61,6 +68,14 @@ PACKAGE_DIR = { ...@@ -61,6 +68,14 @@ PACKAGE_DIR = {
'paddlelite': LITE_PATH 'paddlelite': LITE_PATH
} }
if os.name == 'nt':
# fix the path separator under windows
fix_package_dir = {}
for k, v in PACKAGE_DIR.items():
fix_package_dir[k] = v.replace('/', '\\')
PACKAGE_DIR = fix_package_dir
setup( setup(
name='paddlelite', name='paddlelite',
version=PADDLELITE_VERSION, version=PADDLELITE_VERSION,
......
...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -38,7 +38,7 @@ TEST(CXXApi, test_lite_googlenet) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) { ...@@ -69,7 +69,7 @@ TEST(CXXApi, test_lite_googlenet) {
for (size_t i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5); EXPECT_NEAR(out->data<float>()[i * 51], results[i], 1e-5);
} }
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
} }
......
...@@ -15,7 +15,12 @@ ...@@ -15,7 +15,12 @@
#pragma once #pragma once
#include <gflags/gflags.h> #include <gflags/gflags.h>
#if !defined(_WIN32)
#include <sys/time.h> #include <sys/time.h>
#else
#include <windows.h>
#include "lite/backends/x86/port.h"
#endif
#include <time.h> #include <time.h>
#include <cmath> #include <cmath>
......
...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(InceptionV4, test_inceptionv4_lite_x86) {
0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767})); 0.0010612885, 0.00089107914, 0.0010112736, 0.00097655767}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) { ...@@ -68,13 +68,13 @@ TEST(Mobilenet_v1, test_mobilenetv1_lite_x86) {
0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986, 0.0048292773, 0.0013995157, 0.0018453331, 0.0002428986,
0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722})); 0.00020211363, 0.00013668182, 0.0005855956, 0.00025901722}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -39,7 +39,7 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Mobilenet_v2, test_mobilenetv2_lite_x86) {
0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048, 0.0070957416, 0.0016094646, 0.0018807327, 0.00010506048,
6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358})); 6.823785e-05, 0.00012269315, 0.0007806194, 0.00022354358}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -38,7 +38,7 @@ TEST(Resnet50, test_resnet50_lite_x86) {
input_tensor->Resize(input_shape); input_tensor->Resize(input_shape);
auto* data = input_tensor->mutable_data<float>(); auto* data = input_tensor->mutable_data<float>();
int input_num = 1; int input_num = 1;
for (int i = 0; i < input_shape.size(); ++i) { for (size_t i = 0; i < input_shape.size(); ++i) {
input_num *= input_shape[i]; input_num *= input_shape[i];
} }
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) { ...@@ -69,13 +69,13 @@ TEST(Resnet50, test_resnet50_lite_x86) {
0.006387163, 0.0037145028, 0.0012812682, 0.00045948103, 0.006387163, 0.0037145028, 0.0012812682, 0.00045948103,
0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295})); 0.00013535398, 0.0002483765, 0.00076759676, 0.0002773295}));
auto out = predictor->GetOutput(0); auto out = predictor->GetOutput(0);
ASSERT_EQ(out->shape().size(), 2); ASSERT_EQ(out->shape().size(), 2u);
ASSERT_EQ(out->shape()[0], 1); ASSERT_EQ(out->shape()[0], 1);
ASSERT_EQ(out->shape()[1], 1000); ASSERT_EQ(out->shape()[1], 1000);
int step = 50; int step = 50;
for (int i = 0; i < results.size(); ++i) { for (size_t i = 0; i < results.size(); ++i) {
for (int j = 0; j < results[i].size(); ++j) { for (size_t j = 0; j < results[i].size(); ++j) {
EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)], EXPECT_NEAR(out->data<float>()[j * step + (out->shape()[1] * i)],
results[i][j], results[i][j],
1e-6); 1e-6);
......
...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places, ...@@ -232,8 +232,8 @@ void TestModel(const std::vector<Place>& valid_places,
for (int i = 0; i < outs->numel(); ++i) { for (int i = 0; i < outs->numel(); ++i) {
LOG(INFO) << o_data[i]; LOG(INFO) << o_data[i];
} }
for (int i = 0; i < lod.size(); ++i) { for (size_t i = 0; i < lod.size(); ++i) {
for (int j = 0; j < lod[i].size(); ++j) { for (size_t j = 0; j < lod[i].size(); ++j) {
LOG(INFO) << lod[i][j]; LOG(INFO) << lod[i][j];
} }
} }
......
...@@ -8,4 +8,5 @@ add_subdirectory(npu) ...@@ -8,4 +8,5 @@ add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(apu)
add_subdirectory(rknpu) add_subdirectory(rknpu)
# Build rules for the APU (MediaTek Neuron Runtime) backend wrapper.
# Skip this directory entirely unless APU support was enabled via LITE_WITH_APU.
if(NOT LITE_WITH_APU)
  return()
endif()

# Thin runtime library that dlopen-loads the Neuron API (see device.cc).
lite_cc_library(device_apu SRCS device.cc)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/apu/device.h"
#include <dlfcn.h>
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace apu {
// Resolve the symbol `name` from a dlopen'ed library handle.
// Returns the raw function pointer, or nullptr (after logging a
// warning with the dlerror() diagnostic) when the symbol is absent.
inline void* LoadFunc(void* libHandle, const char* name) {
  CHECK(libHandle != nullptr);
  CHECK(name != nullptr);
  void* symbol = dlsym(libHandle, name);
  if (symbol != nullptr) {
    return symbol;
  }
  LOG(WARNING) << "Unable to open Neuron Runtime function [" << name
               << "] Because " << dlerror();
  return symbol;
}
// Create and finish a NeuronCompilation for `model`.
// `libHandle` is the dlopen'ed Neuron Runtime library; the
// NeuronCompilation_* entry points are resolved from it at runtime via
// LoadFunc. Returns the finished compilation on success, nullptr on any
// failure (missing symbol, create error, or finish error).
NeuronCompilation* Device::Build(void* libHandle, NeuronModel* model) {
  typedef int (*NeuronCompilation_create)(NeuronModel * model,
                                          NeuronCompilation * *compilation);
  typedef void (*NeuronCompilation_free)(NeuronCompilation * compilation);
  typedef int (*NeuronCompilation_finish)(NeuronCompilation * compilation);

#define LOAD_FUNCTIONS(libHandle, FUNC_NAME, VARIABLE_NAME) \
  FUNC_NAME VARIABLE_NAME =                                 \
      reinterpret_cast<FUNC_NAME>(LoadFunc(libHandle, #FUNC_NAME));
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_create, neuron_compilation_create)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_free, neuron_compilation_free)
  LOAD_FUNCTIONS(libHandle, NeuronCompilation_finish, neuron_compilation_finish)
#undef LOAD_FUNCTIONS

  // LoadFunc returns nullptr (and warns) for missing symbols; calling
  // through a null pointer below would crash, so bail out early instead.
  if (neuron_compilation_create == nullptr ||
      neuron_compilation_free == nullptr ||
      neuron_compilation_finish == nullptr) {
    LOG(WARNING) << "[APU] Neuron Runtime compilation symbols not resolved";
    return nullptr;
  }

  int neuron_errCode = 0;
  NeuronCompilation* compilation = NULL;

  VLOG(3) << "[APU] Compile model";

  neuron_errCode = (*neuron_compilation_create)(model, &compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] create compile failed! " << neuron_errCode;
    return nullptr;
  }

  neuron_errCode = (*neuron_compilation_finish)(compilation);
  if (NEURON_NO_ERROR != neuron_errCode) {
    LOG(WARNING) << "[APU] compile failed! " << neuron_errCode;
    // Release the partially-built compilation instead of leaking it
    // (the original code loaded NeuronCompilation_free but never used it).
    (*neuron_compilation_free)(compilation);
    return nullptr;
  }

  VLOG(3) << "[APU] Build done";

  return compilation;
}
} // namespace apu
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "NeuronAdapter.h" // NOLINT
namespace paddle {
namespace lite {
namespace apu {
// Process-wide access point to the APU (Neuron Runtime) backend.
class Device {
 public:
  // Singleton accessor: one shared Device per process
  // (function-local static, constructed on first use).
  static Device& Global() {
    static Device x;
    return x;
  }
  Device() {}

  // Creates and finishes a NeuronCompilation for `model`, resolving the
  // NeuronCompilation_* entry points from `libHandle` (a dlopen'ed Neuron
  // Runtime library). Returns nullptr on failure. Defined in device.cc.
  NeuronCompilation* Build(void* libHandle, NeuronModel* model);
};
} // namespace apu
} // namespace lite
} // namespace paddle
...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -80,8 +80,10 @@ void conv_compute_6x6_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -96,8 +98,8 @@ void conv_compute_6x6_3x3(const float* input,
int tile_h = (hout + 5) / 6; int tile_h = (hout + 5) / 6;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input, ...@@ -127,10 +129,10 @@ void conv_compute_6x6_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -367,8 +369,10 @@ void conv_compute_2x2_3x3(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -383,8 +387,8 @@ void conv_compute_2x2_3x3(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input, ...@@ -414,10 +418,10 @@ void conv_compute_2x2_3x3(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -628,8 +632,10 @@ void conv_compute_2x2_3x3_small(const float* input,
const operators::ConvParam& param, const operators::ConvParam& param,
ARMContext* ctx) { ARMContext* ctx) {
auto act_param = param.activation_param; auto act_param = param.activation_param;
const int pad_h = (*param.paddings)[0]; const int pad_h0 = (*param.paddings)[0];
const int pad_w = (*param.paddings)[2]; const int pad_h1 = (*param.paddings)[1];
const int pad_w0 = (*param.paddings)[2];
const int pad_w1 = (*param.paddings)[3];
float* tmp_work_space = float* tmp_work_space =
ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float); ctx->workspace_data<float>() + ctx->llc_size() / sizeof(float);
...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -644,8 +650,8 @@ void conv_compute_2x2_3x3_small(const float* input,
int tile_h = (hout + 1) / 2; int tile_h = (hout + 1) / 2;
int size_tile = tile_h * tile_w; int size_tile = tile_h * tile_w;
int w_pad = win + pad_w * 2; int w_pad = win + pad_w0 + pad_w1;
int h_pad = hin + pad_h * 2; int h_pad = hin + pad_h0 + pad_h1;
const int zero_len = w_pad; const int zero_len = w_pad;
float zero_ptr[zero_len]; // NOLINT float zero_ptr[zero_len]; // NOLINT
...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input, ...@@ -676,10 +682,10 @@ void conv_compute_2x2_3x3_small(const float* input,
prepack_input_nxwc4_dw(input + ni * in_n_stride, prepack_input_nxwc4_dw(input + ni * in_n_stride,
input_c4 + i * new_c_stride, input_c4 + i * new_c_stride,
i * 4, i * 4,
-pad_h, -pad_h0,
hin + pad_h, hin + pad_h1,
-pad_w, -pad_w0,
win + pad_w, win + pad_w1,
chin, chin,
win, win,
hin, hin,
......
...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input, ...@@ -33,6 +33,7 @@ void add_bias_rowwise(Tensor* input,
for (int w = start_w; w < w_adds; ++w) { for (int w = start_w; w < w_adds; ++w) {
i_data[w] += b_data[w]; i_data[w] += b_data[w];
} }
i_data += width;
} }
} }
void vector_dot( void vector_dot(
...@@ -67,15 +68,8 @@ void vector_dot( ...@@ -67,15 +68,8 @@ void vector_dot(
for (int i = 0; i < remain; ++i) { for (int i = 0; i < remain; ++i) {
if (!v2) { if (!v2) {
out_ptr[i] = in_ptr[i] * v1_ptr[i]; out_ptr[i] = in_ptr[i] * v1_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
} else { } else {
out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i]; out_ptr[i] = in_ptr[i] + v1_ptr[i] * v2_ptr[i];
++out_ptr;
++in_ptr;
++v1_ptr;
++v2_ptr;
} }
} }
} }
......
...@@ -28,6 +28,7 @@ namespace lite { ...@@ -28,6 +28,7 @@ namespace lite {
class CLContext { class CLContext {
public: public:
~CLContext() { ~CLContext() {
GetCommandQueue().finish();
for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) { for (size_t kidx = 0; kidx < kernels_.size(); ++kidx) {
// Note(ysh329): Don't need `clReleaseKernel` // Note(ysh329): Don't need `clReleaseKernel`
kernels_[kidx].reset(); kernels_[kidx].reset();
......
...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) { ...@@ -100,16 +100,18 @@ TEST(cl_test, kernel_test) {
size_t width = in_image.ImageWidth(); size_t width = in_image.ImageWidth();
size_t height = in_image.ImageHeight(); size_t height = in_image.ImageHeight();
auto global_work_size = cl::NDRange{width, height}; auto global_work_size = cl::NDRange{width, height};
cl::Event event;
status = context->GetCommandQueue().enqueueNDRangeKernel( status = context->GetCommandQueue().enqueueNDRangeKernel(
kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, &event); kernel, cl::NullRange, global_work_size, cl::NullRange, nullptr, nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
status = context->GetCommandQueue().finish(); status = context->GetCommandQueue().finish();
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
#if 0
double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); double start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); double stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
double elapsed_micros = (stop_nanos - start_nanos) / 1000.0; double elapsed_micros = (stop_nanos - start_nanos) / 1000.0;
LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us."; LOG(INFO) << "Kernel Run Cost Time: " << elapsed_micros << " us.";
#endif
LOG(INFO) << out_image; LOG(INFO) << out_image;
} }
......
...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw, ...@@ -73,7 +73,7 @@ void CLImageConverterDefault::NCHWToImage(float *nchw,
i2 += 4; i2 += 4;
p++; p++;
} else { } else {
image[i2] = 0.0; image[i2] = Float2Half(0.f);
i2 += 4; i2 += 4;
} }
} }
...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor, ...@@ -261,7 +261,7 @@ void CLImageConverterNWBlock::NCHWToImage(float *tensor,
image[index] = Float2Half(*p); image[index] = Float2Half(*p);
p++; p++;
} else { } else {
image[index] = 0.0; image[index] = Float2Half(0.f);
} }
if (index >= (width * height * 4)) { if (index >= (width * height * 4)) {
LOG(INFO) << " index out of range "; LOG(INFO) << " index out of range ";
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
///////////////////////////////// /////////////////////////////////
...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in ...@@ -108,7 +107,8 @@ inline CL_DTYPE4 activation_type4(CL_DTYPE4 in
#endif #endif
#ifdef RELU6 #ifdef RELU6
output = clamp(in, (CL_DTYPE4)0, (CL_DTYPE4)6); in = fmax((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
output = fmin((CL_DTYPE4)(6.0f, 6.0f, 6.0f, 6.0f), in);
#endif #endif
return output; return output;
} }
...@@ -14,36 +14,30 @@ limitations under the License. */ ...@@ -14,36 +14,30 @@ limitations under the License. */
#include <cl_common.h> #include <cl_common.h>
__kernel void relu(__read_only image2d_t input, __kernel void relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int y = get_global_id(1); // image_height
const int x = get_global_id(0); // image_width const sampler_t sampler =
const int y = get_global_id(1); // image_height CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f), in); in = max((CL_DTYPE4)(0.0f), in);
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void relu6(__read_only image2d_t input, __kernel void relu6(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale){ __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in); in = max((CL_DTYPE4)(0.0f, 0.0f, 0.0f, 0.0f), in);
...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input, ...@@ -51,7 +45,6 @@ __kernel void relu6(__read_only image2d_t input,
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void sigmoid(__read_only image2d_t input, __kernel void sigmoid(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input, ...@@ -64,70 +57,66 @@ __kernel void sigmoid(__read_only image2d_t input,
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out; CL_DTYPE4 out;
out.x = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.x)));
out.y = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.y))); out.x = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.x))));
out.z = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.z))); out.y = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.y))));
out.w = 1.0 / (1.0 + pow(2.71828182, -1.0 * (float)(in.w))); out.z = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.z))));
out.w = (CL_DTYPE)(1.0f / (1.0f + pow(2.71828182f, -1.0f * (float)(in.w))));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void leaky_relu(__read_only image2d_t input, __kernel void leaky_relu(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); const int x = get_global_id(0);
const int y = get_global_id(1); const int y = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in; CL_DTYPE4 s_val = CONVERT_TYPE_TO(scale, CL_DTYPE) * in;
if (in.x < 0.0f){ if (in.x < 0.0f) {
in.x = s_val.x; in.x = s_val.x;
} }
if (in.y < 0.0f){ if (in.y < 0.0f) {
in.y = s_val.y; in.y = s_val.y;
} }
if (in.z < 0.0f){ if (in.z < 0.0f) {
in.z = s_val.z; in.z = s_val.z;
} }
if (in.w < 0.0f){ if (in.w < 0.0f) {
in.w = s_val.w; in.w = s_val.w;
} }
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), in);
} }
__kernel void tanh_act(__read_only image2d_t input, __kernel void tanh_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out= (exp(in) - exp(-in))/ (exp(in) + exp(-in)); CL_DTYPE4 out = (exp(in) - exp(-in)) / (exp(in) + exp(-in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
__kernel void exp_act(__read_only image2d_t input, __kernel void exp_act(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = exp(in); CL_DTYPE4 out = exp(in);
...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input, ...@@ -135,19 +124,16 @@ __kernel void exp_act(__read_only image2d_t input,
} }
__kernel void swish(__read_only image2d_t input, __kernel void swish(__read_only image2d_t input,
__write_only image2d_t output, __write_only image2d_t output,
__private const float threshold, __private const float threshold,
__private const float scale) { __private const float scale) {
const int x = get_global_id(0); // image_width
const int x = get_global_id(0); // image_width const int y = get_global_id(1); // image_height
const int y = get_global_id(1); // image_height
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler =
CLK_ADDRESS_CLAMP | CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
CLK_FILTER_NEAREST;
CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y)); CL_DTYPE4 in = READ_IMG_TYPE(CL_DTYPE_CHAR, input, sampler, (int2)(x, y));
CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in)); CL_DTYPE4 out = in / (1 + exp(-(CL_DTYPE)scale * in));
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, (int2)(x, y), out);
} }
...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -54,10 +54,10 @@ __kernel void bilinear_interp(__read_only image2d_t input,
if (ceil_h > in_dims_h - 1) { if (ceil_h > in_dims_h - 1) {
ceil_h = in_dims_h- 1; ceil_h = in_dims_h- 1;
} }
float wight0_w = center_w - floor_w; CL_DTYPE wight0_w = center_w - floor_w;
float wight0_h = center_h - floor_h; CL_DTYPE wight0_h = center_h - floor_h;
float wight1_w = 1.0 - wight0_w; CL_DTYPE wight1_w = 1.0 - wight0_w;
float wight1_h = 1.0 - wight0_h; CL_DTYPE wight1_h = 1.0 - wight0_h;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP | CLK_ADDRESS_CLAMP |
...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input, ...@@ -92,5 +92,6 @@ __kernel void bilinear_interp(__read_only image2d_t input,
CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h CL_DTYPE4 out = (left_down_data * wight1_w + right_down_data * wight0_w) * wight1_h
+ (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h; + (left_up_data * wight1_w + right_up_data * wight0_w) * wight0_h;
WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out); WRITE_IMG_TYPE(CL_DTYPE_CHAR, output, output_pos, out);
} }
#include <cl_common.h> #include <cl_common.h>
__kernel void conv2d_1x1_opt(__private const int global_size_dim0, __kernel void conv2d_1x1_opt(
__private const int global_size_dim1, __private const int global_size_dim0,
__private const int global_size_dim2, __private const int global_size_dim1,
__read_only image2d_t input_image, __private const int global_size_dim2,
__read_only image2d_t filter, __read_only image2d_t input_image,
__read_only image2d_t filter,
#if defined(BIASE_CH) || defined(BIASE_ELE) #if defined(BIASE_CH) || defined(BIASE_ELE)
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int stride, __private const int stride,
__private const int offset, __private const int offset,
__private const int input_c_block, __private const int input_c_block,
__private const int input_c_origin, __private const int input_c_origin,
__private const int dilation, __private const int dilation,
__private const int input_width, /* of one block */ __private const int input_width, /* of one block */
__private const int input_height, /* of one block */ __private const int input_height, /* of one block */
__private const int output_width, __private const int output_width,
__private const int output_height, __private const int output_height,
__private const int old_w) { __private const int old_w) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple( ...@@ -287,7 +288,7 @@ __kernel void conv2d_1x1_simple(
__read_only image2d_t bias, __read_only image2d_t bias,
#endif #endif
#ifdef BATCH_NORM #ifdef BATCH_NORM
__read_only image2d_t new_scale, __read_only image2d_t new_scale,
__read_only image2d_t new_biase, __read_only image2d_t new_biase,
#endif #endif
__write_only image2d_t output_image, __write_only image2d_t output_image,
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d // buffer -> image2d
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d(__global CL_DTYPE *in, __kernel void buffer_to_image2d(__global CL_DTYPE* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -26,7 +26,6 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2) { __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in, ...@@ -66,16 +65,25 @@ __kernel void buffer_to_image2d(__global CL_DTYPE *in,
#ifdef DEBUG #ifdef DEBUG
if (out_w > 2045) { if (out_w > 2045) {
printf("out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f %.2f\n", printf(
out_w, "out_w:%d, out_C - 4 * out_c:%d, input[pos0~pos3]:%.2f %.2f %.2f "
out_C - 4 * out_c, "%.2f\n",
(float)(in[input_pos0]), out_w,
(float)(in[input_pos1]), out_C - 4 * out_c,
(float)(in[input_pos2]), (float)(in[input_pos0]),
(float)(in[input_pos3])); (float)(in[input_pos1]),
printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n", out_c, out_w, out_nh, (float)(in[input_pos2]),
output_pos.x, output_pos.y, (float)(in[input_pos3]));
(float)(output.x), (float)(output.y), (float)(output.z), (float)(output.w)); printf("buffer2image ===> %d,%d,%d, out(%d,%d): %.2f %.2f %.2f %.2f \n",
out_c,
out_w,
out_nh,
output_pos.x,
output_pos.y,
(float)(output.x),
(float)(output.y),
(float)(output.z),
(float)(output.w));
} }
#endif #endif
...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input, ...@@ -101,34 +109,42 @@ __kernel void image2d_to_buffer(__read_only image2d_t input,
const int in_h = in_nh % in_height; const int in_h = in_nh % in_height;
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)); CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh));
#ifdef DEBUG #ifdef DEBUG
if (in_w > 2045) { if (in_w > 2045) {
printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n", in_c, in_w, in_nh, printf("image2buffer ===> %d,%d,%d, in(%d,%d): %.2f %.2f %.2f %.2f \n",
pos_x, in_nh, in_c,
(float)(in.x), (float)(in.y), (float)(in.z), (float)(in.w)); in_w,
in_nh,
pos_x,
in_nh,
(float)(in.x),
(float)(in.y),
(float)(in.z),
(float)(in.w));
} }
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE); out[index] = CONVERT_TYPE_TO(in.x, CL_DTYPE);
if (C - 4 * in_c >= 2) { if (C - 4 * in_c >= 2) {
out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE); out[index + size_ch] = CONVERT_TYPE_TO(in.y, CL_DTYPE);
} }
if(C - 4 * in_c >= 3) { if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE); out[index + size_ch * 2] = CONVERT_TYPE_TO(in.z, CL_DTYPE);
} }
if(C - 4 * in_c >= 4) { if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE); out[index + size_ch * 3] = CONVERT_TYPE_TO(in.w, CL_DTYPE);
} }
} }
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d_nw // buffer -> image2d_nw
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in, ...@@ -182,8 +198,7 @@ __kernel void buffer_to_image2d_nw(__global CL_DTYPE* in,
} }
#endif #endif
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
#if 0 // NOTE(ysh329): keep, un-used from paddle-mobile
// image2d -> buffer // image2d -> buffer
__kernel void image2d_to_buffer_2d(__private const int in_height, __kernel void image2d_to_buffer_2d(__private const int in_height,
__private const int in_width, __private const int in_width,
...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height, ...@@ -208,15 +223,14 @@ __kernel void image2d_to_buffer_2d(__private const int in_height,
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// buffer -> image2d (divide by 255 to normalize) // buffer -> image2d (divide by 255 to normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void buffer_to_image2d_with_pre255(__global uchar *in, __kernel void buffer_to_image2d_with_pre255(__global uchar* in,
__write_only image2d_t output_image, __write_only image2d_t output_image,
__private const int out_H, __private const int out_H,
__private const int out_W, __private const int out_W,
__private const int out_C, __private const int out_C,
__private const int Stride0, __private const int Stride0,
__private const int Stride1, __private const int Stride1,
__private const int Stride2){ __private const int Stride2) {
const int out_c = get_global_id(0); const int out_c = get_global_id(0);
const int out_w = get_global_id(1); const int out_w = get_global_id(1);
const int out_nh = get_global_id(2); const int out_nh = get_global_id(2);
...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -231,7 +245,6 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
const int in_h = out_h; const int in_h = out_h;
const int in_w = out_w; const int in_w = out_w;
int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w; int input_pos0 = in_n * Stride2 + in_c0 * Stride1 + in_h * Stride0 + in_w;
int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w; int input_pos1 = in_n * Stride2 + in_c1 * Stride1 + in_h * Stride0 + in_w;
int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w; int input_pos2 = in_n * Stride2 + in_c2 * Stride1 + in_h * Stride0 + in_w;
...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in, ...@@ -243,30 +256,29 @@ __kernel void buffer_to_image2d_with_pre255(__global uchar *in,
CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f; CL_COMPUTE_DTYPE4 output = (CL_COMPUTE_DTYPE4)0.0f;
output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255; output.x = CONVERT_TYPE_TO(in[input_pos0], CL_COMPUTE_DTYPE) / 255;
if(out_C - 4 * out_c>=2){ if (out_C - 4 * out_c >= 2) {
output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255; output.y = CONVERT_TYPE_TO(in[input_pos1], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=3){ if (out_C - 4 * out_c >= 3) {
output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255; output.z = CONVERT_TYPE_TO(in[input_pos2], CL_COMPUTE_DTYPE) / 255;
} }
if(out_C - 4 * out_c>=4){ if (out_C - 4 * out_c >= 4) {
output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255; output.w = CONVERT_TYPE_TO(in[input_pos3], CL_COMPUTE_DTYPE) / 255;
} }
WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output); WRITE_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, output_image, output_pos, output);
} }
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
// image2d -> buffer (multiply by 255 to de-normalize) // image2d -> buffer (multiply by 255 to de-normalize)
//////////////////////////////////////////////////////// ////////////////////////////////////////////////////////
__kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
__private const int in_width, __private const int in_width,
__private const int in_height, __private const int in_height,
__global uchar* out, __global uchar* out,
__private const int size_ch, __private const int size_ch,
__private const int size_block, __private const int size_block,
__private const int size_batch, __private const int size_batch,
__private const int C) { __private const int C) {
const int in_c = get_global_id(0); const int in_c = get_global_id(0);
const int in_w = get_global_id(1); const int in_w = get_global_id(1);
const int in_nh = get_global_id(2); const int in_nh = get_global_id(2);
...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input, ...@@ -277,22 +289,34 @@ __kernel void image2d_to_buffer_with_post255(__read_only image2d_t input,
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int pos_x = mad24(in_c, in_width, in_w); const int pos_x = mad24(in_c, in_width, in_w);
CL_COMPUTE_DTYPE4 in = READ_IMG_TYPE(CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) * 255; CL_COMPUTE_DTYPE4 in =
READ_IMG_TYPE(
CL_COMPUTE_DTYPE_CHAR, input, sampler, (int2)(pos_x, in_nh)) *
255;
#ifdef DEBUG #ifdef DEBUG
printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n", printf("in_c:%d, in_w:%d, in_nh:%d ===> in(%d,%d): %.2f %.2f %.2f %.2f\n",
in_c, in_w, in_nh, pos_x, in_nh, in.x, in.y, in.z, in.w); in_c,
in_w,
in_nh,
pos_x,
in_nh,
in.x,
in.y,
in.z,
in.w);
#endif #endif
const int index = in_n * size_batch + in_c * size_block + in_h * in_width + in_w; const int index =
in_n * size_batch + in_c * size_block + in_h * in_width + in_w;
out[index] = convert_uchar_sat(in.x); out[index] = convert_uchar_sat(in.x);
if(C - 4 * in_c>=2){ if (C - 4 * in_c >= 2) {
out[index + size_ch] = convert_uchar_sat(in.y); out[index + size_ch] = convert_uchar_sat(in.y);
} }
if(C - 4 * in_c>=3){ if (C - 4 * in_c >= 3) {
out[index + size_ch * 2] = convert_uchar_sat(in.z); out[index + size_ch * 2] = convert_uchar_sat(in.z);
} }
if(C - 4 * in_c>=4){ if (C - 4 * in_c >= 4) {
out[index + size_ch * 3] = convert_uchar_sat(in.w); out[index + size_ch * 3] = convert_uchar_sat(in.w);
} }
} }
...@@ -45,6 +45,9 @@ bool CLRuntime::Init() { ...@@ -45,6 +45,9 @@ bool CLRuntime::Init() {
bool is_device_init = InitializeDevice(); bool is_device_init = InitializeDevice();
is_init_success_ = is_platform_init && is_device_init; is_init_success_ = is_platform_init && is_device_init;
initialized_ = true; initialized_ = true;
context_ = CreateContext();
command_queue_ = CreateCommandQueue(context());
return initialized_; return initialized_;
} }
...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() { ...@@ -55,7 +58,7 @@ cl::Platform& CLRuntime::platform() {
cl::Context& CLRuntime::context() { cl::Context& CLRuntime::context() {
if (context_ == nullptr) { if (context_ == nullptr) {
context_ = CreateContext(); LOG(FATAL) << "context_ create failed. ";
} }
return *context_; return *context_;
} }
...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() { ...@@ -67,7 +70,7 @@ cl::Device& CLRuntime::device() {
cl::CommandQueue& CLRuntime::command_queue() { cl::CommandQueue& CLRuntime::command_queue() {
if (command_queue_ == nullptr) { if (command_queue_ == nullptr) {
command_queue_ = CreateCommandQueue(context()); LOG(FATAL) << "command_queue_ create failed. ";
} }
return *command_queue_; return *command_queue_;
} }
...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent( ...@@ -96,7 +99,7 @@ std::unique_ptr<cl::UserEvent> CLRuntime::CreateEvent(
bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) { bool CLRuntime::BuildProgram(cl::Program* program, const std::string& options) {
/* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/ /* -I +CLRuntime::Global()->cl_path() + "/cl_kernel"*/
std::string build_option = options + " -cl-fast-relaxed-math "; std::string build_option = options + " -cl-fast-relaxed-math -cl-mad-enable";
VLOG(4) << "OpenCL build_option: " << build_option; VLOG(4) << "OpenCL build_option: " << build_option;
status_ = program->build({*device_}, build_option.c_str()); status_ = program->build({*device_}, build_option.c_str());
CL_CHECK_ERROR(status_); CL_CHECK_ERROR(status_);
......
...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width, ...@@ -66,7 +66,8 @@ void *TargetWrapperCL::MallocImage<float>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width, ...@@ -89,7 +90,8 @@ void *TargetWrapperCL::MallocImage<uint16_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width, ...@@ -112,7 +114,8 @@ void *TargetWrapperCL::MallocImage<int32_t>(const size_t cl_image2d_width,
cl_int status; cl_int status;
cl::Image2D *cl_image = cl::Image2D *cl_image =
new cl::Image2D(CLRuntime::Global()->context(), new cl::Image2D(CLRuntime::Global()->context(),
CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR : 0), CL_MEM_READ_WRITE | (host_ptr ? CL_MEM_COPY_HOST_PTR
: CL_MEM_ALLOC_HOST_PTR),
img_format, img_format,
cl_image2d_width, cl_image2d_width,
cl_image2d_height, cl_image2d_height,
...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -192,7 +195,6 @@ void TargetWrapperCL::MemcpySync(void *dst,
size_t size, size_t size,
IoDirection dir) { IoDirection dir) {
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst, ...@@ -202,9 +204,9 @@ void TargetWrapperCL::MemcpySync(void *dst,
0, 0,
size, size,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst), status = stream.enqueueWriteBuffer(*static_cast<cl::Buffer *>(dst),
...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -283,7 +285,6 @@ void TargetWrapperCL::ImgcpySync(void *dst,
cl::array<size_t, 3> origin = {0, 0, 0}; cl::array<size_t, 3> origin = {0, 0, 0};
cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1}; cl::array<size_t, 3> region = {cl_image2d_width, cl_image2d_height, 1};
cl_int status; cl_int status;
cl::Event event;
auto stream = CLRuntime::Global()->command_queue(); auto stream = CLRuntime::Global()->command_queue();
switch (dir) { switch (dir) {
case IoDirection::DtoD: case IoDirection::DtoD:
...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst, ...@@ -293,9 +294,9 @@ void TargetWrapperCL::ImgcpySync(void *dst,
origin, origin,
region, region,
nullptr, nullptr,
&event); nullptr);
CL_CHECK_FATAL(status); CL_CHECK_FATAL(status);
event.wait(); CLRuntime::Global()->command_queue().finish();
break; break;
case IoDirection::HtoD: case IoDirection::HtoD:
status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst), status = stream.enqueueWriteImage(*static_cast<cl::Image2D *>(dst),
......
...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() { ...@@ -40,7 +40,7 @@ void MatMulJitCode::genCode() {
for (size_t g = 0; g < groups.size(); ++g) { for (size_t g = 0; g < groups.size(); ++g) {
size_t x_offset = 0; size_t x_offset = 0;
size_t wgt_offset_tmp = 0; size_t wgt_offset_tmp = 0;
for (int i = 0; i < g; ++i) { for (size_t i = 0; i < g; ++i) {
wgt_offset_tmp += groups[i] * block_len; wgt_offset_tmp += groups[i] * block_len;
} }
for (int k = 0; k < k_; ++k) { for (int k = 0; k < k_; ++k) {
......
...@@ -28,6 +28,12 @@ ...@@ -28,6 +28,12 @@
#define posix_memalign_free free #define posix_memalign_free free
#endif #endif
#ifdef _WIN32
#define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif
// DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); // DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode"); bool dump_jitcode = paddle::lite::GetBoolFromEnv("dump_jitcode");
...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const { ...@@ -53,10 +59,14 @@ void GenBase::dumpCode(const unsigned char* code) const {
void* GenBase::operator new(size_t size) { void* GenBase::operator new(size_t size) {
void* ptr; void* ptr;
constexpr size_t alignment = 32ul; constexpr size_t alignment = 32ul;
#ifdef _WIN32
ptr = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size), PADDLE_ENFORCE_EQ(posix_memalign(&ptr, alignment, size),
0, 0,
"GenBase Alloc %ld error!", "GenBase Alloc %ld error!",
size); size);
#endif
PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size); PADDLE_ENFORCE(ptr, "Fail to allocate GenBase CPU memory: size = %d .", size);
return ptr; return ptr;
} }
......
...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> { ...@@ -265,7 +265,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
// size_t num_seqs = scores->NumElements(lod_level); // size_t num_seqs = scores->NumElements(lod_level);
size_t num_seqs = scores->lod()[lod_level].size() - 1; size_t num_seqs = scores->lod()[lod_level].size() - 1;
size_t seq_width = 1; size_t seq_width = 1;
for (int i = 1; i < scores->dims().size(); i++) { for (size_t i = 1; i < scores->dims().size(); i++) {
seq_width *= scores->dims()[i]; seq_width *= scores->dims()[i];
} }
......
...@@ -23,7 +23,7 @@ namespace math { ...@@ -23,7 +23,7 @@ namespace math {
MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim, MatDescriptor CreateMatrixDescriptor(const lite::DDimLite &tensor_dim,
int num_flatten_cols, int num_flatten_cols,
bool trans) { bool trans) {
PADDLE_ENFORCE_GT(tensor_dim.size(), 1); PADDLE_ENFORCE_GT(tensor_dim.size(), 1u);
MatDescriptor retv; MatDescriptor retv;
if (num_flatten_cols > 1) { if (num_flatten_cols > 1) {
auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols); auto flatten_dim = tensor_dim.Flatten2D(num_flatten_cols);
......
...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor { ...@@ -46,9 +46,9 @@ class MaxSeqPoolFunctor {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
auto idx_dims = index->dims(); auto idx_dims = index->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, out_dims); PADDLE_ENFORCE_EQ(idx_dims, out_dims);
...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> { ...@@ -95,9 +95,9 @@ class MaxSeqPoolFunctor<T, true> {
lite::Tensor* index) { lite::Tensor* index) {
auto in_dims = input.dims(); auto in_dims = input.dims();
auto out_dims = output->dims(); auto out_dims = output->dims();
PADDLE_ENFORCE_GT(in_dims.size(), 1); PADDLE_ENFORCE_GT(in_dims.size(), 1u);
PADDLE_ENFORCE_GT(out_dims.size(), 1); PADDLE_ENFORCE_GT(out_dims.size(), 1u);
for (int64_t i = 1; i < in_dims.size(); ++i) { for (size_t i = 1; i < in_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
} }
...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor { ...@@ -138,7 +138,7 @@ class MaxSeqPoolGradFunctor {
auto idx_dims = index.dims(); auto idx_dims = index.dims();
PADDLE_ENFORCE_GT(og_dims.size(), 1); PADDLE_ENFORCE_GT(og_dims.size(), 1);
PADDLE_ENFORCE_GT(ig_dims.size(), 1); PADDLE_ENFORCE_GT(ig_dims.size(), 1);
for (int64_t i = 1; i < og_dims.size(); ++i) { for (size_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
} }
PADDLE_ENFORCE_EQ(idx_dims, og_dims); PADDLE_ENFORCE_EQ(idx_dims, og_dims);
......
...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() { ...@@ -38,7 +38,7 @@ static inline int64_t GetMaxThreads() {
// Do not support nested omp parallem. // Do not support nested omp parallem.
num_threads = omp_in_parallel() ? 1 : omp_get_max_threads(); num_threads = omp_in_parallel() ? 1 : omp_get_max_threads();
#endif #endif
return std::max(num_threads, 1L); return std::max<int>(num_threads, 1L);
} }
using ThreadHandler = using ThreadHandler =
......
...@@ -14,10 +14,10 @@ ...@@ -14,10 +14,10 @@
#pragma once #pragma once
#include <time.h>
#include <cstdio> #include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <time.h>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -37,7 +37,9 @@ ...@@ -37,7 +37,9 @@
#define GOOGLE_GLOG_DLL_DECL #define GOOGLE_GLOG_DLL_DECL
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
#include <stdio.h> #include <stdio.h>
#define NOMINMAX // msvc max/min macro conflict with std::min/max
#include <windows.h> #include <windows.h>
#include <winsock.h>
#include <numeric> // std::accumulate in msvc #include <numeric> // std::accumulate in msvc
#ifndef S_ISDIR // windows port for sys/stat.h #ifndef S_ISDIR // windows port for sys/stat.h
#define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR) #define S_ISDIR(mode) (((mode)&S_IFMT) == S_IFDIR)
...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) { ...@@ -62,6 +64,7 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule); return reinterpret_cast<void *>(hModule);
} }
extern struct timeval;
static int gettimeofday(struct timeval *tp, void *tzp) { static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock; time_t clock;
struct tm tm; struct tm tm;
......
...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH) ...@@ -24,13 +24,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
endif() endif()
if (LITE_WITH_X86)
lite_cc_library(variable SRCS variable.cc DEPS tensor) lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc) lite_cc_library(types SRCS types.cc)
else()
lite_cc_library(variable SRCS variable.cc DEPS tensor)
lite_cc_library(types SRCS types.cc)
endif()
lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel) lite_cc_library(op_registry SRCS op_registry.cc DEPS kernel)
lite_cc_library(scope SRCS scope.cc DEPS tensor) lite_cc_library(scope SRCS scope.cc DEPS tensor)
lite_cc_library(device_info SRCS device_info.cc DEPS tensor) lite_cc_library(device_info SRCS device_info.cc DEPS tensor)
......
...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() { ...@@ -107,7 +107,7 @@ void TestCase::PrepareInputsForInstruction() {
CHECK(!shared_tensor_array->empty()) CHECK(!shared_tensor_array->empty())
<< "shared_tensor_array is empty yet"; << "shared_tensor_array is empty yet";
target_tensor_array->resize(shared_tensor_array->size()); target_tensor_array->resize(shared_tensor_array->size());
for (int i = 0; i < shared_tensor_array->size(); i++) { for (size_t i = 0; i < shared_tensor_array->size(); i++) {
target_tensor_array->at(i).Resize( target_tensor_array->at(i).Resize(
shared_tensor_array->at(i).dims()); shared_tensor_array->at(i).dims());
TargetCopy(param_type->type->target(), TargetCopy(param_type->type->target(),
...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name, ...@@ -219,7 +219,7 @@ bool TestCase::CheckPrecision(const std::string& var_name,
auto b_tensor_array = auto b_tensor_array =
base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>(); base_scope_->FindVar(var_name)->GetMutable<std::vector<Tensor>>();
CHECK_EQ(a_tensor_array->size(), b_tensor_array->size()); CHECK_EQ(a_tensor_array->size(), b_tensor_array->size());
for (int i = 0; i < a_tensor_array->size(); i++) { for (size_t i = 0; i < a_tensor_array->size(); i++) {
Tensor* a_tensor = &(a_tensor_array->at(i)); Tensor* a_tensor = &(a_tensor_array->at(i));
Tensor* b_tensor = &(b_tensor_array->at(i)); Tensor* b_tensor = &(b_tensor_array->at(i));
if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) { if (a_tensor->dims().size() == 0 && b_tensor->dims().size() == 0) {
......
...@@ -166,7 +166,7 @@ class TestCase { ...@@ -166,7 +166,7 @@ class TestCase {
// TODO(Superjomn) Move this method to utils or DDim? // TODO(Superjomn) Move this method to utils or DDim?
bool ShapeEquals(const DDim& a, const DDim& b) { bool ShapeEquals(const DDim& a, const DDim& b) {
if (a.size() != b.size()) return false; if (a.size() != b.size()) return false;
for (int i = 0; i < a.size(); i++) { for (size_t i = 0; i < a.size(); i++) {
if (a[i] != b[i]) return false; if (a[i] != b[i]) return false;
} }
return true; return true;
......
...@@ -54,6 +54,7 @@ using HostContext = Context<TargetType::kHost>; ...@@ -54,6 +54,7 @@ using HostContext = Context<TargetType::kHost>;
using X86Context = Context<TargetType::kX86>; using X86Context = Context<TargetType::kX86>;
using ARMContext = Context<TargetType::kARM>; using ARMContext = Context<TargetType::kARM>;
using NPUContext = Context<TargetType::kNPU>; using NPUContext = Context<TargetType::kNPU>;
using APUContext = Context<TargetType::kAPU>;
using XPUContext = Context<TargetType::kXPU>; using XPUContext = Context<TargetType::kXPU>;
using OpenCLContext = Context<TargetType::kOpenCL>; using OpenCLContext = Context<TargetType::kOpenCL>;
using FPGAContext = Context<TargetType::kFPGA>; using FPGAContext = Context<TargetType::kFPGA>;
...@@ -87,6 +88,21 @@ class Context<TargetType::kNPU> { ...@@ -87,6 +88,21 @@ class Context<TargetType::kNPU> {
}; };
#endif #endif
#ifdef LITE_WITH_APU
template <>
class Context<TargetType::kAPU> {
public:
Context() {}
explicit Context(const APUContext& ctx);
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void CopySharedTo(APUContext* ctx) {}
APUContext& operator=(const APUContext& ctx) {}
std::string name() const { return "APUContext"; }
};
#endif
#ifdef LITE_WITH_BM #ifdef LITE_WITH_BM
template <> template <>
class Context<TargetType::kBM> { class Context<TargetType::kBM> {
...@@ -324,27 +340,17 @@ class Context<TargetType::kX86> { ...@@ -324,27 +340,17 @@ class Context<TargetType::kX86> {
template <> template <>
class Context<TargetType::kOpenCL> { class Context<TargetType::kOpenCL> {
std::shared_ptr<CLContext> cl_context_; std::shared_ptr<CLContext> cl_context_;
using WaitListType =
std::unordered_map<decltype(static_cast<const void*>(nullptr)),
std::shared_ptr<cl::Event>>;
std::shared_ptr<WaitListType> cl_wait_list_;
public: public:
CLContext* cl_context() { return cl_context_.get(); } CLContext* cl_context() { return cl_context_.get(); }
WaitListType* cl_wait_list() { return cl_wait_list_.get(); }
void InitOnce() { void InitOnce() {
// Init cl runtime. // Init cl runtime.
CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed"; CHECK(CLRuntime::Global()->IsInitSuccess()) << "OpenCL runtime init failed";
cl_context_ = std::make_shared<CLContext>(); cl_context_ = std::make_shared<CLContext>();
cl_wait_list_ = std::make_shared<WaitListType>();
} }
void CopySharedTo(OpenCLContext* ctx) { void CopySharedTo(OpenCLContext* ctx) { ctx->cl_context_ = cl_context_; }
ctx->cl_context_ = cl_context_;
ctx->cl_wait_list_ = cl_wait_list_;
}
}; };
#endif #endif
...@@ -408,6 +414,12 @@ class ContextScheduler { ...@@ -408,6 +414,12 @@ class ContextScheduler {
&ctx->As<NPUContext>()); &ctx->As<NPUContext>());
break; break;
#endif #endif
#ifdef LITE_WITH_APU
case TARGET(kAPU):
kernel_contexts_[TargetType::kAPU].As<APUContext>().CopySharedTo(
&ctx->As<APUContext>());
break;
#endif
#ifdef LITE_WITH_RKNPU #ifdef LITE_WITH_RKNPU
case TARGET(kRKNPU): case TARGET(kRKNPU):
kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo( kernel_contexts_[TargetType::kRKNPU].As<RKNPUContext>().CopySharedTo(
...@@ -483,6 +495,9 @@ class ContextScheduler { ...@@ -483,6 +495,9 @@ class ContextScheduler {
#ifdef LITE_WITH_NPU #ifdef LITE_WITH_NPU
InitContext<TargetType::kNPU, NPUContext>(); InitContext<TargetType::kNPU, NPUContext>();
#endif #endif
#ifdef LITE_WITH_APU
InitContext<TargetType::kAPU, APUContext>();
#endif
#ifdef LITE_WITH_RKNPU #ifdef LITE_WITH_RKNPU
InitContext<TargetType::kRKNPU, RKNPUContext>(); InitContext<TargetType::kRKNPU, RKNPUContext>();
#endif #endif
......
...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) { ...@@ -947,7 +947,7 @@ void DeviceInfo::RequestPowerNoBindMode(int thread_num) {
active_ids_ = core_ids_; active_ids_ = core_ids_;
} else { } else {
active_ids_.resize(thread_num); active_ids_.resize(thread_num);
for (int i = 0; i < thread_num; ++i) { for (uint32_t i = 0; i < thread_num; ++i) {
if (i < big_core_ids_.size()) { if (i < big_core_ids_.size()) {
active_ids_[i] = big_core_ids_[i]; active_ids_[i] = big_core_ids_[i];
} else { } else {
......
...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type, ...@@ -57,7 +57,7 @@ void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *alias, std::string *alias,
Place *place) { Place *place) {
auto parts = Split(kernel_type, "/"); auto parts = Split(kernel_type, "/");
CHECK_EQ(parts.size(), 5); CHECK_EQ(parts.size(), 5u);
*op_type = parts[0]; *op_type = parts[0];
*alias = parts[1]; *alias = parts[1];
......
...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -163,23 +163,23 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]); weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) { if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + i * w; auto ptr_row = conv_weight_d + i * w;
for (unsigned int j = 0; j < w; ++j) { for (int j = 0; j < w; ++j) {
ptr_row[j] *= -1; ptr_row[j] *= -1;
} }
} }
...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) { ...@@ -203,17 +203,17 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] * int c_size = conv_weight_t->dims()[1] * conv_weight_t->dims()[2] *
conv_weight_t->dims()[3]; conv_weight_t->dims()[3];
int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3]; int hw = conv_weight_t->dims()[2] * conv_weight_t->dims()[3];
for (unsigned int k = 0; k < conv_weight_t->dims()[0]; ++k) { for (int k = 0; k < conv_weight_t->dims()[0]; ++k) {
for (unsigned int i = 0; i < h; ++i) { for (int i = 0; i < h; ++i) {
auto ptr_row = conv_weight_d + k * c_size + i * hw; auto ptr_row = conv_weight_d + k * c_size + i * hw;
for (unsigned int j = 0; j < hw; ++j) { for (int j = 0; j < hw; ++j) {
ptr_row[j] *= alpha_data[i]; ptr_row[j] *= alpha_data[i];
} }
} }
} }
} else { } else {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels for (int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels for (int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i]; conv_weight_d[i * w + j] *= alpha_data[i];
} }
} }
......
...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph, ...@@ -260,7 +260,7 @@ void ChannelWiseDequantOpFuser::InsertNewNode(SSAGraph* graph,
auto channel_scale_tensor = auto channel_scale_tensor =
scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>(); scope->FindVar(channel_scale_name)->GetMutable<lite::Tensor>();
auto* channel_scale_data = channel_scale_tensor->data<float>(); auto* channel_scale_data = channel_scale_tensor->data<float>();
for (int i = 0; i < channel_scale_tensor->data_size(); i++) { for (size_t i = 0; i < channel_scale_tensor->data_size(); i++) {
weight_scale.push_back(channel_scale_data[i] / range); weight_scale.push_back(channel_scale_data[i] / range);
} }
......
...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -313,4 +313,8 @@ void MemoryOptimizePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass) REGISTER_MIR_PASS(memory_optimize_pass, paddle::lite::mir::MemoryOptimizePass)
.BindTargets({TARGET(kARM), TARGET(kOpenCL)}) .BindTargets({TARGET(kARM), TARGET(kOpenCL)})
.ExcludeTargets({TARGET(kNPU), TARGET(kXPU), TARGET(kBM), TARGET(kRKNPU)}); .ExcludeTargets({TARGET(kNPU),
TARGET(kXPU),
TARGET(kBM),
TARGET(kRKNPU),
TARGET(kAPU)});
...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node, ...@@ -292,7 +292,7 @@ void MLUPostprocessPass::GetSubgraphOpArgType(Node* inst_node,
// get subgraph op's type info // get subgraph op's type info
size_t kernel_size = inst_node->AsStmt().kernels().size(); size_t kernel_size = inst_node->AsStmt().kernels().size();
CHECK_GT(kernel_size, 0); CHECK_GT(kernel_size, 0u);
VLOG(4) << "subgraph kernel size: " << kernel_size; VLOG(4) << "subgraph kernel size: " << kernel_size;
for (size_t i = 0; i < kernel_size; ++i) { for (size_t i = 0; i < kernel_size; ++i) {
...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) { ...@@ -450,7 +450,7 @@ bool MLUPostprocessPass::IsFirstConvInSubgraph(Node* arg_node, Node* inst) {
auto* block_desc = auto* block_desc =
static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get()) static_cast<operators::SubgraphOp*>(inst->AsStmt().op().get())
->GetSubBlock(); ->GetSubBlock();
for (int op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) { for (size_t op_idx = 0; op_idx < block_desc->OpsSize(); op_idx++) {
auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx); auto op_desc = block_desc->GetOp<cpp::OpDesc>(op_idx);
CHECK(op_desc); CHECK(op_desc);
if (op_desc->Type() == "conv2d") { if (op_desc->Type() == "conv2d") {
......
...@@ -59,6 +59,9 @@ class PassRegistry { ...@@ -59,6 +59,9 @@ class PassRegistry {
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
// some platform-independent defintion
#include "lite/utils/macros.h"
#define REGISTER_MIR_PASS(name__, class__) \ #define REGISTER_MIR_PASS(name__, class__) \
paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \ paddle::lite::mir::PassRegistry mir_pass_registry##name__(#name__, \
new class__); \ new class__); \
...@@ -66,4 +69,4 @@ class PassRegistry { ...@@ -66,4 +69,4 @@ class PassRegistry {
return mir_pass_registry##name__.Touch(); \ return mir_pass_registry##name__.Touch(); \
} \ } \
static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \ static paddle::lite::mir::PassRegistry mir_pass_registry_func_##name__ \
__attribute__((unused)) = mir_pass_registry##name__ UNUSED = mir_pass_registry##name__
...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply( ...@@ -77,4 +77,4 @@ void QuantizedOpAttributesInferencePass::Apply(
REGISTER_MIR_PASS(quantized_op_attributes_inference_pass, REGISTER_MIR_PASS(quantized_op_attributes_inference_pass,
paddle::lite::mir::QuantizedOpAttributesInferencePass) paddle::lite::mir::QuantizedOpAttributesInferencePass)
.BindTargets({TARGET(kNPU), TARGET(kRKNPU)}); .BindTargets({TARGET(kAPU), TARGET(kRKNPU)});
...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() { ...@@ -47,8 +47,8 @@ std::string SubgraphVisualizer::operator()() {
"turquoise4", "snow3", "sienna4", "salmon2", "turquoise4", "snow3", "sienna4", "salmon2",
}; };
std::unordered_map<Node *, int> subgraph_indices; std::unordered_map<Node *, int> subgraph_indices;
for (int i = 0; i < subgraphs_.size(); i++) { for (size_t i = 0; i < subgraphs_.size(); i++) {
for (int j = 0; j < subgraphs_[i].size(); j++) { for (size_t j = 0; j < subgraphs_[i].size(); j++) {
subgraph_indices[subgraphs_[i][j]] = i; subgraph_indices[subgraphs_[i][j]] = i;
} }
} }
...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph, ...@@ -538,7 +538,8 @@ void SubgraphFuser::ReplaceNodesWithSubgraphs(SSAGraph *graph,
std::vector<std::vector<Node *>> subgraphs = std::vector<std::vector<Node *>> subgraphs =
SubgraphDetector(graph, teller)(); SubgraphDetector(graph, teller)();
SubgraphVisualizer(graph, subgraphs)(); SubgraphVisualizer(graph, subgraphs)();
for (int subgraph_idx = 0; subgraph_idx < subgraphs.size(); subgraph_idx++) { for (size_t subgraph_idx = 0; subgraph_idx < subgraphs.size();
subgraph_idx++) {
if (subgraphs[subgraph_idx].size() >= min_subgraph_size) { if (subgraphs[subgraph_idx].size() >= min_subgraph_size) {
InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]); InsertNewNode(graph, subgraph_idx, subgraphs[subgraph_idx]);
} }
......
...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc( ...@@ -36,8 +36,8 @@ std::vector<std::string> AddFCDesc(
const std::shared_ptr<Scope>& scope, const std::shared_ptr<Scope>& scope,
const std::vector<std::string>& input_var_names, const std::vector<std::string>& input_var_names,
const std::vector<int64_t>& wshape) { const std::vector<int64_t>& wshape) {
CHECK_EQ(input_var_names.size(), 1); CHECK_EQ(input_var_names.size(), 1u);
CHECK_EQ(wshape.size(), 2); CHECK_EQ(wshape.size(), 2u);
static int id = 0; static int id = 0;
std::string prefix = "fc_" + paddle::lite::to_string(id); std::string prefix = "fc_" + paddle::lite::to_string(id);
auto* op_desc = block_desc->AddOp<cpp::OpDesc>(); auto* op_desc = block_desc->AddOp<cpp::OpDesc>();
...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) { ...@@ -169,8 +169,8 @@ TEST(Subgraph, detect_simple_model) {
}; };
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
ASSERT_EQ(graph->nodes().size(), 9); ASSERT_EQ(graph->nodes().size(), 9u);
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
} }
...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) { ...@@ -221,7 +221,7 @@ TEST(Subgraph, detect_custom_model) {
std::vector<std::vector<mir::Node*>> subgraphs = std::vector<std::vector<mir::Node*>> subgraphs =
mir::SubgraphDetector(graph.get(), teller)(); mir::SubgraphDetector(graph.get(), teller)();
mir::SubgraphVisualizer(graph.get(), subgraphs)(); mir::SubgraphVisualizer(graph.get(), subgraphs)();
ASSERT_EQ(subgraphs.size(), 1); ASSERT_EQ(subgraphs.size(), 1u);
} }
} // namespace lite } // namespace lite
......
...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -40,6 +40,22 @@ void NPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fuser(); fuser();
} }
void APUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
std::unordered_set<std::string> supported_lists;
#define USE_SUBGRAPH_BRIDGE(op_type, target) \
supported_lists.insert(#op_type); \
LOG(INFO) << #op_type
#include "lite/kernels/apu/bridges/paddle_use_bridges.h"
#undef USE_SUBGRAPH_BRIDGE
auto teller = [&](Node* node) {
if (!node->IsStmt()) return false;
auto& stmt = node->AsStmt();
return supported_lists.count(stmt.op_type()) != 0;
};
SubgraphFuser fuser(graph.get(), teller, 1 /* min_subgraph_size */);
fuser();
}
void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { void XPUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return; if (!GetBoolFromEnv("XPU_ENABLE_XTCL")) return;
std::unordered_set<std::string> supported_lists; std::unordered_set<std::string> supported_lists;
...@@ -103,6 +119,8 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) { ...@@ -103,6 +119,8 @@ void MLUSubgraphPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass) REGISTER_MIR_PASS(npu_subgraph_pass, paddle::lite::mir::NPUSubgraphPass)
.BindTargets({TARGET(kNPU)}); .BindTargets({TARGET(kNPU)});
REGISTER_MIR_PASS(apu_subgraph_pass, paddle::lite::mir::APUSubgraphPass)
.BindTargets({TARGET(kAPU)});
REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass) REGISTER_MIR_PASS(xpu_subgraph_pass, paddle::lite::mir::XPUSubgraphPass)
.BindTargets({TARGET(kXPU)}); .BindTargets({TARGET(kXPU)});
REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass) REGISTER_MIR_PASS(bm_subgraph_pass, paddle::lite::mir::BMSubgraphPass)
......
...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass { ...@@ -27,6 +27,11 @@ class NPUSubgraphPass : public ProgramPass {
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
}; };
class APUSubgraphPass : public ProgramPass {
public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override;
};
class XPUSubgraphPass : public ProgramPass { class XPUSubgraphPass : public ProgramPass {
public: public:
void Apply(const std::unique_ptr<SSAGraph>& graph) override; void Apply(const std::unique_ptr<SSAGraph>& graph) override;
......
...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) { ...@@ -39,7 +39,7 @@ std::vector<std::vector<int64_t>> ShapeParsing(std::string text) {
std::vector<std::vector<int64_t>> shapes; std::vector<std::vector<int64_t>> shapes;
std::vector<std::string> shape_strings = Split(text, ":"); std::vector<std::string> shape_strings = Split(text, ":");
shapes.resize(shape_strings.size()); shapes.resize(shape_strings.size());
for (int i = 0; i < shape_strings.size(); i++) { for (size_t i = 0; i < shape_strings.size(); i++) {
std::vector<std::string> shape_nums = Split(shape_strings[i], ","); std::vector<std::string> shape_nums = Split(shape_strings[i], ",");
for (auto shape_num : shape_nums) { for (auto shape_num : shape_nums) {
shapes[i].push_back(atoi(shape_num.c_str())); shapes[i].push_back(atoi(shape_num.c_str()));
...@@ -66,7 +66,7 @@ void FillInputTensors( ...@@ -66,7 +66,7 @@ void FillInputTensors(
for (int j = 0; j < input_tensor_size; j++) { \ for (int j = 0; j < input_tensor_size; j++) { \
input_tensor_data[j] = static_cast<type>(value); \ input_tensor_data[j] = static_cast<type>(value); \
} }
for (int i = 0; i < input_tensor_shape.size(); i++) { for (size_t i = 0; i < input_tensor_shape.size(); i++) {
auto input_tensor = predictor->GetInput(i); auto input_tensor = predictor->GetInput(i);
input_tensor->Resize(input_tensor_shape[i]); input_tensor->Resize(input_tensor_shape[i]);
auto input_tensor_size = ShapeProduction(input_tensor->shape()); auto input_tensor_size = ShapeProduction(input_tensor->shape());
...@@ -95,7 +95,7 @@ void CheckOutputTensors( ...@@ -95,7 +95,7 @@ void CheckOutputTensors(
<< " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \ << " abs_diff: " << abs_diff << " rel_diff: " << rel_diff; \
EXPECT_LT(rel_diff, 0.1); \ EXPECT_LT(rel_diff, 0.1); \
} }
for (int i = 0; i < output_tensor_type.size(); i++) { for (size_t i = 0; i < output_tensor_type.size(); i++) {
auto tar_output_tensor = tar_predictor->GetOutput(i); auto tar_output_tensor = tar_predictor->GetOutput(i);
auto ref_output_tensor = ref_predictor->GetOutput(i); auto ref_output_tensor = ref_predictor->GetOutput(i);
auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape()); auto tar_output_tensor_size = ShapeProduction(tar_output_tensor->shape());
......
...@@ -25,23 +25,23 @@ namespace lite { ...@@ -25,23 +25,23 @@ namespace lite {
bool OpLite::InferShape() { bool OpLite::InferShape() {
// if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_ // if input_tensor_ptrs and output_tensor_ptrs are overloaded in param_
// InferShapeByMemoryInternal will be applied. // InferShapeByMemoryInternal will be applied.
if (param_.input_tensor_ptrs() && param_.output_tensor_ptrs()) { if (op_param_ && op_param_->input_tensor_ptrs() &&
op_param_->output_tensor_ptrs()) {
return this->InferShapeWithCache(); return this->InferShapeWithCache();
} else { } else {
// otherwise, InferShapeImpl is applied directly.
return this->InferShapeImpl(); return this->InferShapeImpl();
} }
} }
bool OpLite::InferShapeWithCache() { bool OpLite::InferShapeWithCache() {
// 1. Get vector of current input tensors // 1. Get vector of current input tensors
auto *current_inputs = param_.input_tensor_ptrs(); auto *current_inputs = op_param_->input_tensor_ptrs();
// 2. Get hash value of current inputs shape and lod // 2. Get hash value of current inputs shape and lod
size_t new_hash = 0; size_t new_hash = 0;
for (auto iter = current_inputs->begin(); iter != current_inputs->end(); for (auto iter = current_inputs->begin(); iter != current_inputs->end();
iter++) { iter++) {
// combined dims value into new_hash value. // combined dims value into new_hash value.
auto &element_dims = (*iter)->dims(); auto &element_dims = (*iter)->dims();
for (int i = 0; i < element_dims.size(); i++) { for (size_t i = 0; i < element_dims.size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(element_dims[i])); lite::hash_combine(new_hash, static_cast<int>(element_dims[i]));
} }
...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() { ...@@ -49,7 +49,7 @@ bool OpLite::InferShapeWithCache() {
auto &emement_lods = (*iter)->lod(); auto &emement_lods = (*iter)->lod();
for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end(); for (auto lod_iter = emement_lods.begin(); lod_iter != emement_lods.end();
lod_iter++) { lod_iter++) {
for (int i = 0; i < lod_iter->size(); i++) { for (size_t i = 0; i < lod_iter->size(); i++) {
new_hash = new_hash =
lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i))); lite::hash_combine(new_hash, static_cast<int>(lod_iter->at(i)));
} }
...@@ -59,8 +59,8 @@ bool OpLite::InferShapeWithCache() { ...@@ -59,8 +59,8 @@ bool OpLite::InferShapeWithCache() {
if (new_hash == io_shape_lod_hash_ && new_hash != 0) { if (new_hash == io_shape_lod_hash_ && new_hash != 0) {
// if current hash value is consistent with io_shape_lod_hash_, // if current hash value is consistent with io_shape_lod_hash_,
// previous outputs shape and lod are reused. // previous outputs shape and lod are reused.
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { for (size_t i = 0; i < current_outputs->size(); i++) {
current_outputs->at(i)->Resize(last_output_shapes[i]); current_outputs->at(i)->Resize(last_output_shapes[i]);
current_outputs->at(i)->set_lod(last_output_lods[i]); current_outputs->at(i)->set_lod(last_output_lods[i]);
} }
...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() { ...@@ -68,10 +68,12 @@ bool OpLite::InferShapeWithCache() {
// otherwise, current hash value is changed, InferShapeImpl will apply. // otherwise, current hash value is changed, InferShapeImpl will apply.
io_shape_lod_hash_ = new_hash; io_shape_lod_hash_ = new_hash;
this->InferShapeImpl(); this->InferShapeImpl();
auto *current_outputs = param_.output_tensor_ptrs(); auto *current_outputs = op_param_->output_tensor_ptrs();
for (int i = 0; i < current_outputs->size(); i++) { last_output_shapes.clear();
last_output_shapes[i] = current_outputs->at(i)->dims(); last_output_lods.clear();
last_output_lods[i] = current_outputs->at(i)->lod(); for (size_t i = 0; i < current_outputs->size(); i++) {
last_output_shapes.push_back(current_outputs->at(i)->dims());
last_output_lods.push_back(current_outputs->at(i)->lod());
} }
} }
return true; return true;
......
...@@ -77,6 +77,11 @@ class OpLite : public Registry { ...@@ -77,6 +77,11 @@ class OpLite : public Registry {
// Link the external execution environ to internal context. // Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
template <typename T>
inline void AttachParam(T *param) {
op_param_ = static_cast<T *>(param);
}
const OpInfo *op_info() const { return op_info_.get(); } const OpInfo *op_info() const { return op_info_.get(); }
OpInfo *mutable_op_info() { return op_info_.get(); } OpInfo *mutable_op_info() { return op_info_.get(); }
...@@ -167,11 +172,10 @@ class OpLite : public Registry { ...@@ -167,11 +172,10 @@ class OpLite : public Registry {
std::vector<Place> valid_places_; std::vector<Place> valid_places_;
Place kernel_place_{TARGET(kHost), PRECISION(kFloat)}; Place kernel_place_{TARGET(kHost), PRECISION(kFloat)};
std::unique_ptr<OpInfo> op_info_; std::unique_ptr<OpInfo> op_info_;
std::vector<DDimLite> last_output_shapes{}; std::vector<DDimLite> last_output_shapes{};
std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{}; std::vector<std::vector<std::vector<uint64_t>>> last_output_lods{};
size_t io_shape_lod_hash_{}; size_t io_shape_lod_hash_{};
mutable operators::ParamBase param_; mutable operators::ParamBase *op_param_{nullptr};
private: private:
// Infer Shape according to memory, if current input shapes are consistent // Infer Shape according to memory, if current input shapes are consistent
......
...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create( ...@@ -98,6 +98,9 @@ std::list<std::unique_ptr<KernelBase>> KernelRegistry::Create(
case TARGET(kNPU): { case TARGET(kNPU): {
CREATE_KERNEL(kNPU); CREATE_KERNEL(kNPU);
} break; } break;
case TARGET(kAPU): {
CREATE_KERNEL(kAPU);
} break;
case TARGET(kXPU): { case TARGET(kXPU): {
CREATE_KERNEL(kXPU); CREATE_KERNEL(kXPU);
} break; } break;
...@@ -220,6 +223,7 @@ KernelRegistry::KernelRegistry() ...@@ -220,6 +223,7 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kNPU, kAny, kNHWC); INIT_FOR(kNPU, kAny, kNHWC);
INIT_FOR(kNPU, kAny, kAny); INIT_FOR(kNPU, kAny, kAny);
INIT_FOR(kAPU, kInt8, kNCHW);
INIT_FOR(kXPU, kFloat, kNCHW); INIT_FOR(kXPU, kFloat, kNCHW);
INIT_FOR(kXPU, kInt8, kNCHW); INIT_FOR(kXPU, kInt8, kNCHW);
INIT_FOR(kXPU, kAny, kNCHW); INIT_FOR(kXPU, kAny, kNCHW);
......
...@@ -111,18 +111,23 @@ class KernelRegistry final { ...@@ -111,18 +111,23 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA), KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNHWC)> *, // DATALAYOUT(kNHWC)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kX86), KernelRegistryForTarget<TARGET(kX86),
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kFloat), PRECISION(kFloat),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
...@@ -141,9 +146,7 @@ class KernelRegistry final { ...@@ -141,9 +146,7 @@ class KernelRegistry final {
KernelRegistryForTarget<TARGET(kHost), KernelRegistryForTarget<TARGET(kHost),
PRECISION(kInt64), PRECISION(kInt64),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kCUDA),
PRECISION(kAny),
DATALAYOUT(kAny)> *, //
KernelRegistryForTarget<TARGET(kARM), KernelRegistryForTarget<TARGET(kARM),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -231,6 +234,9 @@ class KernelRegistry final { ...@@ -231,6 +234,9 @@ class KernelRegistry final {
PRECISION(kInt8), PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, // DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kAPU),
PRECISION(kInt8),
DATALAYOUT(kNCHW)> *, //
KernelRegistryForTarget<TARGET(kXPU), KernelRegistryForTarget<TARGET(kXPU),
PRECISION(kAny), PRECISION(kAny),
DATALAYOUT(kAny)> *, // DATALAYOUT(kAny)> *, //
...@@ -445,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> { ...@@ -445,32 +451,31 @@ class KernelRegistor : public lite::Registor<KernelType> {
#define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \ #define LITE_KERNEL_REGISTER_FAKE(op_type__, target__, precision__, alias__) \
LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__) LITE_KERNEL_REGISTER_INSTANCE(op_type__, target__, precision__, alias__)
#define REGISTER_LITE_KERNEL( \ #define REGISTER_LITE_KERNEL( \
op_type__, target__, precision__, layout__, KernelClass, alias__) \ op_type__, target__, precision__, layout__, KernelClass, alias__) \
static paddle::lite::KernelRegistor<TARGET(target__), \ static paddle::lite::KernelRegistor<TARGET(target__), \
PRECISION(precision__), \ PRECISION(precision__), \
DATALAYOUT(layout__), \ DATALAYOUT(layout__), \
KernelClass> \ KernelClass> \
LITE_KERNEL_REGISTER_INSTANCE( \ LITE_KERNEL_REGISTER_INSTANCE( \
op_type__, target__, precision__, layout__, alias__)(#op_type__, \ op_type__, target__, precision__, layout__, alias__)(#op_type__, \
#alias__); \ #alias__); \
static KernelClass LITE_KERNEL_INSTANCE( \ static KernelClass LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__); \ op_type__, target__, precision__, layout__, alias__); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \ int touch_##op_type__##target__##precision__##layout__##alias__() { \
OpKernelInfoCollector::Global().AddKernel2path( \ OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \ #op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \ __FILE__); \
LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \ LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
.Touch(); \ .Touch(); \
return 0; \ return 0; \
} \ } \
static bool LITE_KERNEL_PARAM_INSTANCE( \ static bool LITE_KERNEL_PARAM_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) UNUSED = \
__attribute__((unused)) = \ paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \
paddle::lite::ParamTypeRegistry::NewInstance<TARGET(target__), \ PRECISION(precision__), \
PRECISION(precision__), \ DATALAYOUT(layout__)>( \
DATALAYOUT(layout__)>( \ #op_type__ "/" #alias__)
#op_type__ "/" #alias__)
#define LITE_KERNEL_INSTANCE( \ #define LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__) \ op_type__, target__, precision__, layout__, alias__) \
......
...@@ -101,6 +101,7 @@ class Optimizer { ...@@ -101,6 +101,7 @@ class Optimizer {
"npu_subgraph_pass", "npu_subgraph_pass",
"xpu_subgraph_pass", "xpu_subgraph_pass",
"bm_subgraph_pass", "bm_subgraph_pass",
"apu_subgraph_pass",
"rknpu_subgraph_pass", "rknpu_subgraph_pass",
"static_kernel_pick_pass", // pick original kernel from graph "static_kernel_pick_pass", // pick original kernel from graph
"variable_place_inference_pass", // inference arg/var's "variable_place_inference_pass", // inference arg/var's
......
...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) { ...@@ -72,7 +72,7 @@ void RuntimeProgram::UpdateVarsOfProgram(cpp::ProgramDesc* desc) {
std::unordered_map<std::string, cpp::VarDesc> origin_var_maps; std::unordered_map<std::string, cpp::VarDesc> origin_var_maps;
auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0); auto& main_block = *desc->GetBlock<cpp::BlockDesc>(0);
auto var_size = main_block.VarsSize(); auto var_size = main_block.VarsSize();
for (int i = 0; i < var_size; i++) { for (size_t i = 0; i < var_size; i++) {
auto v = main_block.GetVar<cpp::VarDesc>(i); auto v = main_block.GetVar<cpp::VarDesc>(i);
auto name = v->Name(); auto name = v->Name();
origin_var_maps.emplace(name, *v); origin_var_maps.emplace(name, *v);
......
...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) { ...@@ -100,7 +100,7 @@ void *TensorLite::mutable_data(TargetType target, size_t memory_size) {
void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer, void TensorLite::ResetBuffer(std::shared_ptr<Buffer> buffer,
size_t memory_size) { size_t memory_size) {
CHECK_EQ(offset_, 0) CHECK_EQ(offset_, 0u)
<< "Only the offset is supported to zero when the Buffer is reset."; << "Only the offset is supported to zero when the Buffer is reset.";
if (buffer_) { if (buffer_) {
CHECK_LE(memory_size_, buffer->space()) CHECK_LE(memory_size_, buffer->space())
......
...@@ -30,7 +30,7 @@ namespace core { ...@@ -30,7 +30,7 @@ namespace core {
// TODO(Superjomn) unify all the type representation across the lite framework. // TODO(Superjomn) unify all the type representation across the lite framework.
enum class Type { enum class Type {
UNK = -1, UNK = -1,
// primary types // primary typesINT32,
INT32, INT32,
INT64, INT64,
FLOAT32, FLOAT32,
...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>(); ...@@ -92,6 +92,8 @@ Type StdTypeToRepr<float>();
template <> template <>
Type StdTypeToRepr<bool>(); Type StdTypeToRepr<bool>();
template <> template <>
Type StdTypeToRepr<double>();
template <>
Type StdTypeToRepr<std::vector<char>>(); Type StdTypeToRepr<std::vector<char>>();
template <> template <>
Type StdTypeToRepr<std::string>(); Type StdTypeToRepr<std::string>();
......
...@@ -18,6 +18,11 @@ ...@@ -18,6 +18,11 @@
#include "paddle_api.h" // NOLINT #include "paddle_api.h" // NOLINT
#include "paddle_use_passes.h" // NOLINT #include "paddle_use_passes.h" // NOLINT
#if defined(_WIN32)
#include "paddle_use_kernels.h" // NOLINT
#include "paddle_use_ops.h" // NOLINT
#endif
using namespace paddle::lite_api; // NOLINT using namespace paddle::lite_api; // NOLINT
DEFINE_string(model_dir, "", "Model dir path."); DEFINE_string(model_dir, "", "Model dir path.");
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -23,7 +23,7 @@ import argparse ...@@ -23,7 +23,7 @@ import argparse
import sys import sys
sys.path.append('../../python/lib') sys.path.append('../../python/lib')
from lite_core import * from paddlelite.lite import *
# Command arguments # Command arguments
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h
#include "lite/fluid/data_type.h" #include "lite/fluid/data_type.h"
#include <stdint.h> #include <stdint.h>
#include <string> #include <string>
......
...@@ -11,5 +11,6 @@ add_subdirectory(fpga) ...@@ -11,5 +11,6 @@ add_subdirectory(fpga)
add_subdirectory(npu) add_subdirectory(npu)
add_subdirectory(xpu) add_subdirectory(xpu)
add_subdirectory(mlu) add_subdirectory(mlu)
add_subdirectory(apu)
add_subdirectory(bm) add_subdirectory(bm)
add_subdirectory(rknpu) add_subdirectory(rknpu)
add_subdirectory(bridges)
add_kernel(subgraph_compute_apu APU basic SRCS subgraph_compute.cc DEPS ${lite_kernel_deps} device_apu subgraph_bridge_engine ${apu_subgraph_bridges})
if(NOT LITE_WITH_APU)
return()
endif()
lite_cc_library(subgraph_bridge_utility_apu SRCS utility.cc DEPS tensor)
lite_cc_library(subgraph_bridge_graph_apu SRCS graph.cc DEPS subgraph_bridge_utility_apu)
set(apu_subgraph_bridge_deps subgraph_bridge_registry subgraph_bridge_utility_apu subgraph_bridge_graph_apu)
lite_cc_library(subgraph_bridge_conv_op_apu SRCS conv_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_elementwise_ops_apu SRCS elementwise_ops.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_act_op_apu SRCS act_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_pool_op_apu SRCS pool_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_softmax_op_apu SRCS softmax_op.cc DEPS ${apu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_fc_op_apu SRCS fc_op.cc DEPS ${apu_subgraph_bridge_deps})
set(apu_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_utility_apu
subgraph_bridge_conv_op_apu
subgraph_bridge_elementwise_ops_apu
subgraph_bridge_act_op_apu
subgraph_bridge_softmax_op_apu
subgraph_bridge_fc_op_apu
subgraph_bridge_pool_op_apu
CACHE INTERNAL "apu_subgraph_bridges")
message(STATUS "+++++ apu_subgraph_bridges: ${apu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/apu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace apu {
int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[APU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
return SUCCESS;
}
} // namespace apu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(relu, kAPU, paddle::lite::subgraph::apu::ActConverter);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册